seanpedrickcase committed on
Commit
3bff849
·
1 Parent(s): 601fcda

Updated command line redaction script with more options

Browse files
Dockerfile CHANGED
@@ -101,7 +101,7 @@ RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_
101
  && chmod 755 \
102
  ${APP_HOME}/.local/share/spacy/data \
103
  mkdir -p /usr/share/tessdata && \
104
- chmod 755 /usr/share/tessdata # Create tessdata directory and set permissions
105
 
106
  # Copy installed packages from builder stage
107
  COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
 
101
  && chmod 755 \
102
  ${APP_HOME}/.local/share/spacy/data \
103
  mkdir -p /usr/share/tessdata && \
104
+ chmod 755 /usr/share/tessdata
105
 
106
  # Copy installed packages from builder stage
107
  COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
tools/cli_redact.py CHANGED
@@ -1,84 +1,164 @@
1
  import argparse
2
  import os
3
- from tools.config import get_or_create_env_var
4
- from tools.helper_functions import ensure_output_folder_exists,tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 
5
  from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
6
  from tools.file_redaction import choose_and_run_redactor
7
- import pandas as pd
8
- from datetime import datetime
9
-
10
- chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER', 'CREDIT_DEBIT_CVV', 'CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS',
11
- 'NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD',
12
- 'IP_ADDRESS','MAC_ADDRESS','LICENSE_PLATE',
13
- 'VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER',
14
- 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE',
15
- 'UK_NATIONAL_HEALTH_SERVICE_NUMBER']
16
- chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS",
17
- "STREETNAME", "UKPOSTCODE"]
18
-
19
- def main(first_loop_state=True, latest_file_completed=0, output_summary="", output_file_list=None,
20
- log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
21
- current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Redact all identified handwriting", "Redact all identified signatures"]):
22
-
23
- if output_file_list is None:
24
- output_file_list = []
25
- if log_files_list is None:
26
- log_files_list = []
27
 
28
- parser = argparse.ArgumentParser(description='Redact PII from documents via command line')
29
-
30
- # Required arguments
31
- parser.add_argument('--input_file', help='Path to input file (PDF, JPG, or PNG)')
32
-
33
- # Optional arguments with defaults matching the GUI app
34
- parser.add_argument('--ocr_method', choices=[text_ocr_option, tesseract_ocr_option, textract_option],
35
- default='Quick image analysis', help='OCR method to use')
36
- parser.add_argument('--pii_detector', choices=[local_pii_detector, aws_pii_detector],
37
- default='Local', help='PII detection method')
38
- parser.add_argument('--page_min', type=int, default=0, help='First page to redact')
39
- parser.add_argument('--page_max', type=int, default=0, help='Last page to redact')
40
- parser.add_argument('--allow_list', help='Path to allow list CSV file')
41
- parser.add_argument('--output_dir', default='output/', help='Output directory')
42
 
43
- args = parser.parse_args()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
- # Ensure output directory exists
46
- ensure_output_folder_exists()
 
 
 
 
 
 
 
 
 
 
47
 
48
- # Create file object similar to what Gradio provides
49
- file_obj = {"name": args.input_file}
 
 
 
 
 
 
 
 
50
 
51
- # Load allow list if provided
52
- allow_list_df = pd.DataFrame()
53
- if args.allow_list:
54
- allow_list_df = pd.read_csv(args.allow_list)
 
 
 
55
 
56
- # Get file names
57
- file_name_no_ext, file_name_with_ext, full_file_name = get_input_file_names(file_obj)
58
 
59
- # Initialize empty states for PDF processing
 
 
 
60
 
61
- # Prepare PDF/image
62
- output_summary, prepared_pdf, images_pdf, max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations = prepare_image_or_pdf(
63
- file_obj, args.ocr_method, allow_list_df, latest_file_completed,
64
- output_summary, first_loop_state, args.page_max, current_loop_page, all_image_annotations
65
- )
66
-
67
- output_summary, output_files, output_file_list, latest_file_completed, log_files, \
68
- log_files_list, estimated_time, textract_metadata, pdf_doc_state, all_image_annotations, \
69
- current_loop_page, page_break, all_line_level_ocr_results, all_decision_process_table, \
70
- comprehend_query_num = choose_and_run_redactor(
71
- file_obj, prepared_pdf, images_pdf, "en", chosen_redact_entities,
72
- chosen_comprehend_entities, args.ocr_method, allow_list_df,
73
- latest_file_completed, output_summary, output_file_list, log_files_list,
74
- first_loop_state, args.page_min, args.page_max, estimated_time,
75
- handwrite_signature_checkbox, textract_metadata, all_image_annotations,
76
- all_line_level_ocr_results, all_decision_process_table, pdf_doc_state,
77
- current_loop_page, page_break, args.pii_detector, comprehend_query_num, args.output_dir
78
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
- print(f"\nRedaction complete. Output file_list:\n{output_file_list}")
81
- print(f"\nOutput files saved to: {args.output_dir}")
 
 
 
 
 
82
 
83
  if __name__ == "__main__":
84
- main()
 
1
  import argparse
2
  import os
3
+ import pandas as pd
4
+ from tools.config import get_or_create_env_var, LOCAL_PII_OPTION, AWS_PII_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION
5
+ from tools.helper_functions import ensure_output_folder_exists
6
  from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
7
  from tools.file_redaction import choose_and_run_redactor
8
+ from tools.anonymisation import anonymise_files_with_open_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
# --- Constants and Configuration ---
INPUT_FOLDER = 'input/'
OUTPUT_FOLDER = 'output/'
DEFAULT_LANGUAGE = 'en'

# AWS Comprehend PII entity types that the redaction workflows target.
chosen_comprehend_entities = [
    'BANK_ACCOUNT_NUMBER',
    'BANK_ROUTING',
    'CREDIT_DEBIT_NUMBER',
    'CREDIT_DEBIT_CVV',
    'CREDIT_DEBIT_EXPIRY',
    'PIN',
    'EMAIL',
    'ADDRESS',
    'NAME',
    'PHONE',
    'PASSPORT_NUMBER',
    'DRIVER_ID',
    'USERNAME',
    'PASSWORD',
    'IP_ADDRESS',
    'MAC_ADDRESS',
    'LICENSE_PLATE',
    'VEHICLE_IDENTIFICATION_NUMBER',
    'UK_NATIONAL_INSURANCE_NUMBER',
    'INTERNATIONAL_BANK_ACCOUNT_NUMBER',
    'SWIFT_CODE',
    'UK_NATIONAL_HEALTH_SERVICE_NUMBER',
]

# Entity labels used by the local (non-AWS) PII detector.
chosen_redact_entities = [
    "TITLES",
    "PERSON",
    "PHONE_NUMBER",
    "EMAIL_ADDRESS",
    "STREETNAME",
    "UKPOSTCODE",
]
27
+
28
# --- Main CLI Function ---

def _load_allow_list(path):
    """Load an allow-list CSV into a DataFrame; empty DataFrame when no path is given."""
    # NOTE(review): pd.read_csv treats the first row as a header — confirm the
    # expected allow-list CSV format actually has a header row.
    return pd.read_csv(path) if path else pd.DataFrame()


def _load_deny_list(path):
    """Load the first column of a deny-list CSV as a list of terms; empty list when no path is given."""
    return pd.read_csv(path).iloc[:, 0].tolist() if path else []


def _run_pdf_redaction(args, allow_list):
    """Run the two-step PDF/image redaction workflow: prepare, then redact."""
    print("--- Detected PDF/Image file. Starting Redaction Workflow... ---")

    # Step 1: Prepare the document (page images, annotations, cropboxes, page sizes).
    print("\nStep 1: Preparing document...")
    (
        prep_summary, prepared_pdf_paths, image_file_paths, _, _, pdf_doc,
        image_annotations, _, original_cropboxes, page_sizes, textract_output_found, _, _, _, _
    ) = prepare_image_or_pdf(
        file_paths=[args.input_file], text_extract_method=args.ocr_method,
        all_line_level_ocr_results_df=pd.DataFrame(), all_page_line_level_ocr_results_with_words_df=pd.DataFrame(),
        first_loop_state=True, prepare_for_review=args.prepare_for_review,
        output_folder=args.output_dir, prepare_images=args.prepare_images
    )
    print(f"Preparation complete. {prep_summary}")

    # Step 2: Redact the prepared document.
    print("\nStep 2: Running redaction...")
    (
        output_summary, output_files, _, _, log_files, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _
    ) = choose_and_run_redactor(
        file_paths=[args.input_file], prepared_pdf_file_paths=prepared_pdf_paths,
        pdf_image_file_paths=image_file_paths, chosen_redact_entities=chosen_redact_entities,
        chosen_redact_comprehend_entities=chosen_comprehend_entities, text_extraction_method=args.ocr_method,
        in_allow_list=allow_list, first_loop_state=True, page_min=args.page_min, page_max=args.page_max,
        pymupdf_doc=pdf_doc, annotations_all_pages=image_annotations, page_sizes=page_sizes,
        document_cropboxes=original_cropboxes, pii_identification_method=args.pii_detector,
        aws_access_key_textbox=args.aws_access_key, aws_secret_key_textbox=args.aws_secret_key,
        language=args.language, output_folder=args.output_dir
    )

    print("\n--- Redaction Process Complete ---")
    print(f"Summary: {output_summary}")
    print(f"\nOutput files saved to: {args.output_dir}")
    print("Generated Files:", sorted(output_files))
    if log_files: print("Log Files:", sorted(log_files))


def _run_tabular_anonymisation(args, allow_list, deny_list):
    """Run the Word/tabular anonymisation workflow on a single input file."""
    print("--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---")

    output_summary, output_files, _, _, log_files, _, _ = anonymise_files_with_open_text(
        file_paths=[args.input_file],
        in_text="",  # Not used for file-based operations
        anon_strat=args.anon_strat,
        chosen_cols=args.columns,
        chosen_redact_entities=chosen_redact_entities,
        in_allow_list=allow_list,
        in_excel_sheets=args.excel_sheets,
        first_loop_state=True,
        output_folder=args.output_dir,
        in_deny_list=deny_list,
        max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
        pii_identification_method=args.pii_detector,
        chosen_redact_comprehend_entities=chosen_comprehend_entities,
        aws_access_key_textbox=args.aws_access_key,
        aws_secret_key_textbox=args.aws_secret_key,
        language=args.language
    )

    print("\n--- Anonymisation Process Complete ---")
    print(f"Summary: {output_summary}")
    print(f"\nOutput files saved to: {args.output_dir}")
    print("Generated Files:", sorted(output_files))
    if log_files: print("Log Files:", sorted(log_files))


def main():
    """
    Unified command-line interface to prepare, redact, and anonymise documents.

    Routes the input file to one of two workflows based on its extension:
    PDF/image redaction (.pdf, .png, .jpg, .jpeg) or Word/tabular
    anonymisation (.docx, .xlsx, .xls, .csv, .parquet).

    Exits with status 1 when a workflow fails or the file type is
    unsupported, so shell callers can detect errors.
    """
    parser = argparse.ArgumentParser(
        description='A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.',
        formatter_class=argparse.RawTextHelpFormatter
    )

    # --- General Arguments (apply to all file types) ---
    general_group = parser.add_argument_group('General Options')
    general_group.add_argument('--input_file', required=True, help='Path to the input file to process.')
    general_group.add_argument('--output_dir', default=OUTPUT_FOLDER, help='Directory for all output files.')
    general_group.add_argument('--language', default=DEFAULT_LANGUAGE, help='Language of the document content.')
    general_group.add_argument('--allow_list', help='Path to a CSV file with words to exclude from redaction.')
    general_group.add_argument('--pii_detector',
                               choices=[LOCAL_PII_OPTION, AWS_PII_OPTION],
                               default=LOCAL_PII_OPTION,
                               help='Core PII detection method (Local or AWS).')
    general_group.add_argument('--aws_access_key', default='', help='Your AWS Access Key ID.')
    general_group.add_argument('--aws_secret_key', default='', help='Your AWS Secret Access Key.')

    # --- PDF/Image Redaction Arguments ---
    pdf_group = parser.add_argument_group('PDF/Image Redaction Options (.pdf, .png, .jpg)')
    pdf_group.add_argument('--ocr_method',
                           choices=[SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION],
                           default=TESSERACT_TEXT_EXTRACT_OPTION,
                           help='OCR method for text extraction from images.')
    pdf_group.add_argument('--page_min', type=int, default=0, help='First page to redact.')
    pdf_group.add_argument('--page_max', type=int, default=999, help='Last page to redact.')
    pdf_group.add_argument('--prepare_for_review', action='store_true', help='Prepare files for reviewing redactions.')
    pdf_group.add_argument('--no_images', action='store_false', dest='prepare_images', help='Disable image creation for PDF pages.')

    # --- Word/Tabular Anonymisation Arguments ---
    tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
    tabular_group.add_argument('--anon_strat', choices=['redact', 'encrypt', 'hash'], default='redact', help='The anonymisation strategy to apply.')
    tabular_group.add_argument('--columns', nargs='+', default=[], help='A list of column names to anonymise in tabular data.')
    tabular_group.add_argument('--excel_sheets', nargs='+', default=[], help='Specific Excel sheet names to process.')
    tabular_group.add_argument('--deny_list', help='Path to a CSV file with specific terms/phrases to redact.')
    tabular_group.add_argument('--fuzzy_mistakes', type=int, default=1, help='Number of allowed spelling mistakes for fuzzy matching.')

    args = parser.parse_args()

    # --- Initial Setup ---
    ensure_output_folder_exists(args.output_dir)
    file_extension = os.path.splitext(args.input_file)[1].lower()

    # Load allow/deny lists (empty defaults when not supplied).
    allow_list = _load_allow_list(args.allow_list)
    deny_list = _load_deny_list(args.deny_list)

    # --- Route to the Correct Workflow Based on File Type ---
    if file_extension in ('.pdf', '.png', '.jpg', '.jpeg'):
        try:
            _run_pdf_redaction(args, allow_list)
        except Exception as e:
            # Report the failure and exit non-zero so callers can detect it
            # (previously the error was printed but the process exited 0).
            print(f"\nAn error occurred during the PDF/Image redaction workflow: {e}")
            raise SystemExit(1)
    elif file_extension in ('.docx', '.xlsx', '.xls', '.csv', '.parquet'):
        try:
            _run_tabular_anonymisation(args, allow_list, deny_list)
        except Exception as e:
            print(f"\nAn error occurred during the Word/Tabular anonymisation workflow: {e}")
            raise SystemExit(1)
    else:
        print(f"Error: Unsupported file type '{file_extension}'.")
        print("Supported types for redaction: .pdf, .png, .jpg, .jpeg")
        print("Supported types for anonymisation: .docx, .xlsx, .xls, .csv, .parquet")
        raise SystemExit(1)


if __name__ == "__main__":
    main()
tools/custom_image_analyser_engine.py CHANGED
@@ -696,8 +696,8 @@ class CustomImageAnalyzerEngine:
696
  ) -> List[CustomImageRecognizerResult]:
697
 
698
  page_text = ""
699
- page_text_mapping = []
700
- all_text_line_results = []
701
  comprehend_query_number = 0
702
  print("custom_entities:", custom_entities)
703
 
@@ -774,13 +774,13 @@ class CustomImageAnalyzerEngine:
774
 
775
  # Process text in batches for AWS Comprehend
776
  current_batch = ""
777
- current_batch_mapping = []
778
  batch_char_count = 0
779
  batch_word_count = 0
780
 
781
  for i, text_line in enumerate(line_level_ocr_results):
782
  words = text_line.text.split()
783
- word_start_positions = []
784
  current_pos = 0
785
 
786
  for word in words:
@@ -839,7 +839,7 @@ class CustomImageAnalyzerEngine:
839
  comprehend_query_number += 1
840
 
841
  # Process results and create bounding boxes
842
- combined_results = []
843
  for i, text_line in enumerate(line_level_ocr_results):
844
  line_results = next((results for idx, results in all_text_line_results if idx == i), [])
845
  if line_results and i < len(ocr_results_with_words):
@@ -872,7 +872,7 @@ class CustomImageAnalyzerEngine:
872
  allow_list: List[str],
873
  ocr_results_with_words_child_info: Dict[str, Dict]
874
  ) -> List[CustomImageRecognizerResult]:
875
- redaction_bboxes = []
876
 
877
  for redaction_relevant_ocr_result in redaction_relevant_ocr_results:
878
  #print("ocr_results_with_words_child_info:", ocr_results_with_words_child_info)
@@ -895,7 +895,7 @@ class CustomImageAnalyzerEngine:
895
  matched_words = matched_text.split()
896
 
897
  # Find the corresponding words in the OCR results
898
- matching_word_boxes = []
899
 
900
  current_position = 0
901
 
@@ -1236,13 +1236,13 @@ def run_page_text_redaction(
1236
  )
1237
 
1238
  current_batch = ""
1239
- current_batch_mapping = []
1240
  batch_char_count = 0
1241
  batch_word_count = 0
1242
 
1243
  for i, text_line in enumerate(line_level_text_results_list):
1244
  words = text_line.text.split()
1245
- word_start_positions = []
1246
 
1247
  # Calculate word start positions within the line
1248
  current_pos = 0
@@ -1320,12 +1320,12 @@ def merge_text_bounding_boxes(analyser_results:dict, characters: List[LTChar], c
1320
  '''
1321
  Merge identified bounding boxes containing PII that are very close to one another
1322
  '''
1323
- analysed_bounding_boxes = []
1324
- original_bounding_boxes = [] # List to hold original bounding boxes
1325
 
1326
  if len(analyser_results) > 0 and len(characters) > 0:
1327
  # Extract bounding box coordinates for sorting
1328
- bounding_boxes = []
1329
  for result in analyser_results:
1330
  #print("Result:", result)
1331
  char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
@@ -1346,11 +1346,11 @@ def merge_text_bounding_boxes(analyser_results:dict, characters: List[LTChar], c
1346
  # Sort the results by y-coordinate and then by x-coordinate
1347
  bounding_boxes.sort()
1348
 
1349
- merged_bounding_boxes = []
1350
  current_box = None
1351
  current_y = None
1352
  current_result = None
1353
- current_text = []
1354
 
1355
  for y, x, result, next_box, text in bounding_boxes:
1356
  if current_y is None or current_box is None:
@@ -1406,7 +1406,7 @@ def merge_text_bounding_boxes(analyser_results:dict, characters: List[LTChar], c
1406
  return analysed_bounding_boxes
1407
 
1408
  def recreate_page_line_level_ocr_results_with_page(page_line_level_ocr_results_with_words: dict):
1409
- reconstructed_results = []
1410
 
1411
  # Assume all lines belong to the same page, so we can just read it from one item
1412
  #page = next(iter(page_line_level_ocr_results_with_words.values()))["page"]
@@ -1445,7 +1445,7 @@ def split_words_and_punctuation_from_line(line_of_words: List[OCRResult]) -> Lis
1445
  # Punctuation that will be split off. Hyphen is not included.
1446
  PUNCTUATION_TO_SPLIT = {'.', ',', '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'}
1447
 
1448
- new_word_list = []
1449
 
1450
  for word_result in line_of_words:
1451
  word_text = word_result.text
@@ -1528,8 +1528,8 @@ def combine_ocr_results(ocr_results: List[OCRResult], x_threshold: float = 50.0,
1528
  if not ocr_results:
1529
  return {"page": page, "results": []}, {"page": page, "results": {}}
1530
 
1531
- lines = []
1532
- current_line = []
1533
  for result in sorted(ocr_results, key=lambda x: (x.top, x.left)):
1534
  if not current_line or abs(result.top - current_line[0].top) <= y_threshold:
1535
  current_line.append(result)
@@ -1539,7 +1539,7 @@ def combine_ocr_results(ocr_results: List[OCRResult], x_threshold: float = 50.0,
1539
  if current_line:
1540
  lines.append(sorted(current_line, key=lambda x: x.left))
1541
 
1542
- page_line_level_ocr_results = []
1543
  page_line_level_ocr_results_with_words = {}
1544
  line_counter = 1
1545
 
 
696
  ) -> List[CustomImageRecognizerResult]:
697
 
698
  page_text = ""
699
+ page_text_mapping = list()
700
+ all_text_line_results = list()
701
  comprehend_query_number = 0
702
  print("custom_entities:", custom_entities)
703
 
 
774
 
775
  # Process text in batches for AWS Comprehend
776
  current_batch = ""
777
+ current_batch_mapping = list()
778
  batch_char_count = 0
779
  batch_word_count = 0
780
 
781
  for i, text_line in enumerate(line_level_ocr_results):
782
  words = text_line.text.split()
783
+ word_start_positions = list()
784
  current_pos = 0
785
 
786
  for word in words:
 
839
  comprehend_query_number += 1
840
 
841
  # Process results and create bounding boxes
842
+ combined_results = list()
843
  for i, text_line in enumerate(line_level_ocr_results):
844
  line_results = next((results for idx, results in all_text_line_results if idx == i), [])
845
  if line_results and i < len(ocr_results_with_words):
 
872
  allow_list: List[str],
873
  ocr_results_with_words_child_info: Dict[str, Dict]
874
  ) -> List[CustomImageRecognizerResult]:
875
+ redaction_bboxes = list()
876
 
877
  for redaction_relevant_ocr_result in redaction_relevant_ocr_results:
878
  #print("ocr_results_with_words_child_info:", ocr_results_with_words_child_info)
 
895
  matched_words = matched_text.split()
896
 
897
  # Find the corresponding words in the OCR results
898
+ matching_word_boxes = list()
899
 
900
  current_position = 0
901
 
 
1236
  )
1237
 
1238
  current_batch = ""
1239
+ current_batch_mapping = list()
1240
  batch_char_count = 0
1241
  batch_word_count = 0
1242
 
1243
  for i, text_line in enumerate(line_level_text_results_list):
1244
  words = text_line.text.split()
1245
+ word_start_positions = list()
1246
 
1247
  # Calculate word start positions within the line
1248
  current_pos = 0
 
1320
  '''
1321
  Merge identified bounding boxes containing PII that are very close to one another
1322
  '''
1323
+ analysed_bounding_boxes = list()
1324
+ original_bounding_boxes = list() # List to hold original bounding boxes
1325
 
1326
  if len(analyser_results) > 0 and len(characters) > 0:
1327
  # Extract bounding box coordinates for sorting
1328
+ bounding_boxes = list()
1329
  for result in analyser_results:
1330
  #print("Result:", result)
1331
  char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
 
1346
  # Sort the results by y-coordinate and then by x-coordinate
1347
  bounding_boxes.sort()
1348
 
1349
+ merged_bounding_boxes = list()
1350
  current_box = None
1351
  current_y = None
1352
  current_result = None
1353
+ current_text = list()
1354
 
1355
  for y, x, result, next_box, text in bounding_boxes:
1356
  if current_y is None or current_box is None:
 
1406
  return analysed_bounding_boxes
1407
 
1408
  def recreate_page_line_level_ocr_results_with_page(page_line_level_ocr_results_with_words: dict):
1409
+ reconstructed_results = list()
1410
 
1411
  # Assume all lines belong to the same page, so we can just read it from one item
1412
  #page = next(iter(page_line_level_ocr_results_with_words.values()))["page"]
 
1445
  # Punctuation that will be split off. Hyphen is not included.
1446
  PUNCTUATION_TO_SPLIT = {'.', ',', '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'}
1447
 
1448
+ new_word_list = list()
1449
 
1450
  for word_result in line_of_words:
1451
  word_text = word_result.text
 
1528
  if not ocr_results:
1529
  return {"page": page, "results": []}, {"page": page, "results": {}}
1530
 
1531
+ lines = list()
1532
+ current_line = list()
1533
  for result in sorted(ocr_results, key=lambda x: (x.top, x.left)):
1534
  if not current_line or abs(result.top - current_line[0].top) <= y_threshold:
1535
  current_line.append(result)
 
1539
  if current_line:
1540
  lines.append(sorted(current_line, key=lambda x: x.left))
1541
 
1542
+ page_line_level_ocr_results = list()
1543
  page_line_level_ocr_results_with_words = {}
1544
  line_counter = 1
1545
 
tools/data_anonymise.py CHANGED
@@ -327,7 +327,7 @@ def anonymise_files_with_open_text(file_paths: List[str],
327
  This function anonymises data files based on the provided parameters.
328
 
329
  Parameters:
330
- - file_paths (List[str]): A list of file paths to anonymise.
331
  - in_text (str): The text to anonymise if file_paths is 'open_text'.
332
  - anon_strat (str): The anonymisation strategy to use.
333
  - chosen_cols (List[str]): A list of column names to anonymise.
 
327
  This function anonymises data files based on the provided parameters.
328
 
329
  Parameters:
330
+ - file_paths (List[str]): A list of file paths to anonymise: '.xlsx', '.xls', '.csv', '.parquet', or '.docx'.
331
  - in_text (str): The text to anonymise if file_paths is 'open_text'.
332
  - anon_strat (str): The anonymisation strategy to use.
333
  - chosen_cols (List[str]): A list of column names to anonymise.
tools/example_cli_calls.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ python cli_redact.py --help
2
+
3
+ python cli_redact.py \
4
+ --input_file "documents/confidential-report.pdf" \
5
+ --output_dir "output/redacted_reports/" \
6
+ --ocr_method "Local OCR model - PDFs without selectable text" \
7
+ --pii_detector "Local" \
8
+ --page_min 2 \
9
+ --page_max 10 \
10
+ --allow_list "config/project_allowlist.csv"
11
+
tools/file_conversion.py CHANGED
@@ -72,7 +72,7 @@ def check_image_size_and_reduce(out_path:str, image:Image):
72
  Check if a given image size is above around 4.5mb, and reduce size if necessary. 5mb is the maximum possible to submit to AWS Textract.
73
  '''
74
 
75
- all_img_details = []
76
  page_num = 0
77
 
78
  # Check file size and resize if necessary
@@ -168,9 +168,9 @@ def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min
168
  # Set page max to length of pdf if not specified
169
  if page_max == 0: page_max = page_count
170
 
171
- results = []
172
  with ThreadPoolExecutor(max_workers=num_threads) as executor:
173
- futures = []
174
  for page_num in range(page_min, page_max):
175
  futures.append(executor.submit(process_single_page_for_image_conversion, pdf_path, page_num, image_dpi, create_images=create_images, input_folder=input_folder))
176
 
@@ -222,10 +222,10 @@ def process_file_for_image_creation(file_path:str, prepare_for_review:bool=False
222
 
223
  else:
224
  print(f"{file_path} is not an image or PDF file.")
225
- img_path = []
226
- image_sizes_width = []
227
- image_sizes_height = []
228
- all_img_details = []
229
 
230
  return img_path, image_sizes_width, image_sizes_height, all_img_details
231
 
@@ -234,7 +234,7 @@ def get_input_file_names(file_input:List[str]):
234
  Get list of input files to report to logs.
235
  '''
236
 
237
- all_relevant_files = []
238
  file_name_with_extension = ""
239
  full_file_name = ""
240
  total_pdf_page_count = 0
@@ -419,8 +419,8 @@ def redact_whole_pymupdf_page(rect_height:float, rect_width:float, page:Page, cu
419
  return whole_page_img_annotation_box
420
 
421
  def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float], image_sizes_height:List[float], image_file_paths:List[str]):
422
- page_sizes = []
423
- original_cropboxes = []
424
 
425
  for page_no, page in enumerate(pymupdf_doc):
426
  reported_page_no = page_no + 1
@@ -443,9 +443,6 @@ def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float]
443
  out_page_image_sizes['cropbox_x_offset'] = pymupdf_page.cropbox.x0 - pymupdf_page.mediabox.x0
444
 
445
  # cropbox_y_offset_from_top: Distance from MediaBox top edge to CropBox top edge
446
- # MediaBox top y = mediabox.y1
447
- # CropBox top y = cropbox.y1
448
- # The difference is mediabox.y1 - cropbox.y1
449
  out_page_image_sizes['cropbox_y_offset_from_top'] = pymupdf_page.mediabox.y1 - pymupdf_page.cropbox.y1
450
 
451
  if image_sizes_width and image_sizes_height:
@@ -460,7 +457,7 @@ def word_level_ocr_output_to_dataframe(ocr_results: dict) -> pd.DataFrame:
460
  '''
461
  Convert a json of ocr results to a dataframe
462
  '''
463
- rows = []
464
  ocr_result_page = ocr_results[0]
465
 
466
  for ocr_result in ocr_results:
@@ -540,11 +537,11 @@ def prepare_image_or_pdf(
540
 
541
  tic = time.perf_counter()
542
  json_from_csv = False
543
- original_cropboxes = [] # Store original CropBox values
544
- converted_file_paths = []
545
- image_file_paths = []
546
- # pymupdf_doc = []
547
- all_img_details = []
548
  review_file_csv = pd.DataFrame()
549
  out_textract_path = ""
550
  combined_out_message = ""
@@ -557,15 +554,15 @@ def prepare_image_or_pdf(
557
  # If this is the first time around, set variables to 0/blank
558
  if first_loop_state==True:
559
  latest_file_completed = 0
560
- out_message = []
561
- all_annotations_object = []
562
  else:
563
  print("Now redacting file", str(latest_file_completed))
564
 
565
  # If combined out message or converted_file_paths are blank, change to a list so it can be appended to
566
  if isinstance(out_message, str): out_message = [out_message]
567
 
568
- if not file_paths: file_paths = []
569
 
570
  if isinstance(file_paths, dict): file_paths = os.path.abspath(file_paths["name"])
571
 
@@ -595,8 +592,8 @@ def prepare_image_or_pdf(
595
 
596
  # Loop through files to load in
597
  for file in file_paths_loop:
598
- converted_file_path = []
599
- image_file_path = []
600
 
601
  if isinstance(file, str):
602
  file_path = file
@@ -631,12 +628,12 @@ def prepare_image_or_pdf(
631
 
632
  #Create base version of the annotation object that doesn't have any annotations in it
633
  if (not all_annotations_object) & (prepare_for_review == True):
634
- all_annotations_object = []
635
 
636
  for image_path in image_file_paths:
637
  annotation = {}
638
  annotation["image"] = image_path
639
- annotation["boxes"] = []
640
 
641
  all_annotations_object.append(annotation)
642
 
@@ -826,29 +823,6 @@ def prepare_image_or_pdf(
826
  else:
827
  print(f"Skipping {file_path}: Expected 1 JSON file, found {len(json_files)}")
828
 
829
- # elif file_extension in ['.csv'] and "ocr_output" in file_path:
830
- # continue
831
-
832
- # Must be something else, return with error message
833
- # else:
834
- # if prepare_for_review == False:
835
- # if text_extract_method == TESSERACT_TEXT_EXTRACT_OPTION or text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION:
836
- # if is_pdf_or_image(file_path) == False:
837
- # out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
838
- # print(out_message)
839
- # raise Exception(out_message)
840
-
841
- # else:# text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION:
842
- # if is_pdf(file_path) == False:
843
- # out_message = "Please upload a PDF file for text analysis."
844
- # print(out_message)
845
- # raise Exception(out_message)
846
- # else:
847
- # message = f"File {file_name_with_ext} not a recognised type for review, skipping"
848
- # print(message)
849
- # gr.Info(message)
850
- # continue
851
-
852
  converted_file_paths.append(converted_file_path)
853
  image_file_paths.extend(image_file_path)
854
 
@@ -966,7 +940,7 @@ def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
966
  image_groups[item['image']].append(item)
967
 
968
  # Process each group to prioritize items with non-empty boxes
969
- result = []
970
  for image, items in image_groups.items():
971
  # Filter items with non-empty boxes
972
  non_empty_boxes = [item for item in items if item.get('boxes')]
@@ -1496,7 +1470,7 @@ def create_annotation_dicts_from_annotation_df(
1496
  def convert_annotation_json_to_review_df(
1497
  all_annotations: List[dict],
1498
  redaction_decision_output: pd.DataFrame = pd.DataFrame(),
1499
- page_sizes: List[dict] = [],
1500
  do_proximity_match: bool = True
1501
  ) -> pd.DataFrame:
1502
  '''
@@ -2021,7 +1995,7 @@ def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12
2021
  # --- Generate Unique IDs ---
2022
  character_set = string.ascii_letters + string.digits # a-z, A-Z, 0-9
2023
  generated_ids_set = set() # Keep track of IDs generated *in this run*
2024
- new_ids_list = [] # Store the generated IDs in order
2025
 
2026
  max_possible_ids = len(character_set) ** length
2027
  if num_needed > max_possible_ids:
@@ -2228,14 +2202,14 @@ def convert_review_df_to_annotation_json(
2228
 
2229
 
2230
  # --- Build JSON Structure ---
2231
- json_data = []
2232
  output_cols_for_boxes = [col for col in ["label", "color", xmin, ymin, xmax, ymax, "id", "text"] if col in review_file_df.columns]
2233
 
2234
  # Iterate through page_sizes_df to define the structure (one entry per image path)
2235
  for _, row in page_sizes_df.iterrows():
2236
  page_num = row['page'] # Already Int64
2237
  pdf_image_path = row['image_path']
2238
- annotation_boxes = [] # Default to empty list
2239
 
2240
  # Check if the page exists in the grouped annotations (using the faster set lookup)
2241
  # Check pd.notna because page_num could be <NA> if conversion failed
@@ -2254,7 +2228,7 @@ def convert_review_df_to_annotation_json(
2254
 
2255
  except KeyError:
2256
  print(f"Warning: Group key {page_num} not found despite being in group_keys (should not happen).")
2257
- annotation_boxes = [] # Keep empty
2258
 
2259
  # Append the structured data for this image/page
2260
  json_data.append({
 
72
  Check if a given image size is above around 4.5mb, and reduce size if necessary. 5mb is the maximum possible to submit to AWS Textract.
73
  '''
74
 
75
+ all_img_details = list()
76
  page_num = 0
77
 
78
  # Check file size and resize if necessary
 
168
  # Set page max to length of pdf if not specified
169
  if page_max == 0: page_max = page_count
170
 
171
+ results = list()
172
  with ThreadPoolExecutor(max_workers=num_threads) as executor:
173
+ futures = list()
174
  for page_num in range(page_min, page_max):
175
  futures.append(executor.submit(process_single_page_for_image_conversion, pdf_path, page_num, image_dpi, create_images=create_images, input_folder=input_folder))
176
 
 
222
 
223
  else:
224
  print(f"{file_path} is not an image or PDF file.")
225
+ img_path = list()
226
+ image_sizes_width = list()
227
+ image_sizes_height = list()
228
+ all_img_details = list()
229
 
230
  return img_path, image_sizes_width, image_sizes_height, all_img_details
231
 
 
234
  Get list of input files to report to logs.
235
  '''
236
 
237
+ all_relevant_files = list()
238
  file_name_with_extension = ""
239
  full_file_name = ""
240
  total_pdf_page_count = 0
 
419
  return whole_page_img_annotation_box
420
 
421
  def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float], image_sizes_height:List[float], image_file_paths:List[str]):
422
+ page_sizes = list()
423
+ original_cropboxes = list()
424
 
425
  for page_no, page in enumerate(pymupdf_doc):
426
  reported_page_no = page_no + 1
 
443
  out_page_image_sizes['cropbox_x_offset'] = pymupdf_page.cropbox.x0 - pymupdf_page.mediabox.x0
444
 
445
  # cropbox_y_offset_from_top: Distance from MediaBox top edge to CropBox top edge
 
 
 
446
  out_page_image_sizes['cropbox_y_offset_from_top'] = pymupdf_page.mediabox.y1 - pymupdf_page.cropbox.y1
447
 
448
  if image_sizes_width and image_sizes_height:
 
457
  '''
458
  Convert a json of ocr results to a dataframe
459
  '''
460
+ rows = list()
461
  ocr_result_page = ocr_results[0]
462
 
463
  for ocr_result in ocr_results:
 
537
 
538
  tic = time.perf_counter()
539
  json_from_csv = False
540
+ original_cropboxes = list() # Store original CropBox values
541
+ converted_file_paths = list()
542
+ image_file_paths = list()
543
+ # pymupdf_doc = list()
544
+ all_img_details = list()
545
  review_file_csv = pd.DataFrame()
546
  out_textract_path = ""
547
  combined_out_message = ""
 
554
  # If this is the first time around, set variables to 0/blank
555
  if first_loop_state==True:
556
  latest_file_completed = 0
557
+ out_message = list()
558
+ all_annotations_object = list()
559
  else:
560
  print("Now redacting file", str(latest_file_completed))
561
 
562
  # If combined out message or converted_file_paths are blank, change to a list so it can be appended to
563
  if isinstance(out_message, str): out_message = [out_message]
564
 
565
+ if not file_paths: file_paths = list()
566
 
567
  if isinstance(file_paths, dict): file_paths = os.path.abspath(file_paths["name"])
568
 
 
592
 
593
  # Loop through files to load in
594
  for file in file_paths_loop:
595
+ converted_file_path = list()
596
+ image_file_path = list()
597
 
598
  if isinstance(file, str):
599
  file_path = file
 
628
 
629
  #Create base version of the annotation object that doesn't have any annotations in it
630
  if (not all_annotations_object) & (prepare_for_review == True):
631
+ all_annotations_object = list()
632
 
633
  for image_path in image_file_paths:
634
  annotation = {}
635
  annotation["image"] = image_path
636
+ annotation["boxes"] = list()
637
 
638
  all_annotations_object.append(annotation)
639
 
 
823
  else:
824
  print(f"Skipping {file_path}: Expected 1 JSON file, found {len(json_files)}")
825
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
826
  converted_file_paths.append(converted_file_path)
827
  image_file_paths.extend(image_file_path)
828
 
 
940
  image_groups[item['image']].append(item)
941
 
942
  # Process each group to prioritize items with non-empty boxes
943
+ result = list()
944
  for image, items in image_groups.items():
945
  # Filter items with non-empty boxes
946
  non_empty_boxes = [item for item in items if item.get('boxes')]
 
1470
  def convert_annotation_json_to_review_df(
1471
  all_annotations: List[dict],
1472
  redaction_decision_output: pd.DataFrame = pd.DataFrame(),
1473
+ page_sizes: List[dict] = list(),
1474
  do_proximity_match: bool = True
1475
  ) -> pd.DataFrame:
1476
  '''
 
1995
  # --- Generate Unique IDs ---
1996
  character_set = string.ascii_letters + string.digits # a-z, A-Z, 0-9
1997
  generated_ids_set = set() # Keep track of IDs generated *in this run*
1998
+ new_ids_list = list() # Store the generated IDs in order
1999
 
2000
  max_possible_ids = len(character_set) ** length
2001
  if num_needed > max_possible_ids:
 
2202
 
2203
 
2204
  # --- Build JSON Structure ---
2205
+ json_data = list()
2206
  output_cols_for_boxes = [col for col in ["label", "color", xmin, ymin, xmax, ymax, "id", "text"] if col in review_file_df.columns]
2207
 
2208
  # Iterate through page_sizes_df to define the structure (one entry per image path)
2209
  for _, row in page_sizes_df.iterrows():
2210
  page_num = row['page'] # Already Int64
2211
  pdf_image_path = row['image_path']
2212
+ annotation_boxes = list() # Default to empty list
2213
 
2214
  # Check if the page exists in the grouped annotations (using the faster set lookup)
2215
  # Check pd.notna because page_num could be <NA> if conversion failed
 
2228
 
2229
  except KeyError:
2230
  print(f"Warning: Group key {page_num} not found despite being in group_keys (should not happen).")
2231
+ annotation_boxes = list() # Keep empty
2232
 
2233
  # Append the structured data for this image/page
2234
  json_data.append({
tools/file_redaction.py CHANGED
@@ -201,7 +201,7 @@ def choose_and_run_redactor(file_paths:List[str],
201
  pdf_file_name_with_ext = ""
202
  pdf_file_name_without_ext = ""
203
  page_break_return = False
204
- blank_request_metadata = []
205
  all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
206
  review_out_file_paths = [prepared_pdf_file_paths[0]]
207
 
@@ -387,7 +387,7 @@ def choose_and_run_redactor(file_paths:List[str],
387
  if not in_allow_list.empty:
388
  in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
389
  else:
390
- in_allow_list_flat = []
391
 
392
  # If string, assume file path
393
  if isinstance(custom_recogniser_word_list, str):
@@ -396,7 +396,7 @@ def choose_and_run_redactor(file_paths:List[str],
396
  if not custom_recogniser_word_list.empty:
397
  custom_recogniser_word_list_flat = custom_recogniser_word_list.iloc[:, 0].tolist()
398
  else:
399
- custom_recogniser_word_list_flat = []
400
 
401
  # Sort the strings in order from the longest string to the shortest
402
  custom_recogniser_word_list_flat = sorted(custom_recogniser_word_list_flat, key=len, reverse=True)
@@ -412,7 +412,7 @@ def choose_and_run_redactor(file_paths:List[str],
412
  print("Could not convert whole page redaction data to number list due to:", e)
413
  redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].tolist()
414
  else:
415
- redact_whole_page_list_flat = []
416
 
417
 
418
 
@@ -1100,7 +1100,7 @@ def set_cropbox_safely(page: Page, original_cropbox: Optional[Rect]):
1100
  else:
1101
  page.set_cropbox(original_cropbox)
1102
 
1103
- def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]=[], page_sizes_df:pd.DataFrame=pd.DataFrame()):
1104
 
1105
  rect_height = page.rect.height
1106
  rect_width = page.rect.width
@@ -1127,7 +1127,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
1127
  image_dimensions = {}
1128
 
1129
  out_annotation_boxes = {}
1130
- all_image_annotation_boxes = []
1131
 
1132
  if isinstance(image, Image.Image):
1133
  image_path = move_page_info(str(page))
@@ -1238,10 +1238,25 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
1238
  # IMAGE-BASED OCR PDF TEXT DETECTION/REDACTION WITH TESSERACT OR AWS TEXTRACT
1239
  ###
1240
 
1241
- def merge_img_bboxes(bboxes:list, combined_results: Dict, page_signature_recogniser_results=[], page_handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Extract handwriting", "Extract signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1242
 
1243
- all_bboxes = []
1244
- merged_bboxes = []
1245
  grouped_bboxes = defaultdict(list)
1246
 
1247
  # Deep copy original bounding boxes to retain them
@@ -1256,7 +1271,7 @@ def merge_img_bboxes(bboxes:list, combined_results: Dict, page_signature_recogni
1256
  merged_bboxes.extend(copy.deepcopy(page_signature_recogniser_results))
1257
 
1258
  # Reconstruct bounding boxes for substrings of interest
1259
- reconstructed_bboxes = []
1260
  for bbox in bboxes:
1261
  bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
1262
  for line_text, line_info in combined_results.items():
@@ -1266,7 +1281,7 @@ def merge_img_bboxes(bboxes:list, combined_results: Dict, page_signature_recogni
1266
  start_char = line_text.index(bbox.text)
1267
  end_char = start_char + len(bbox.text)
1268
 
1269
- relevant_words = []
1270
  current_char = 0
1271
  for word in line_info['words']:
1272
  word_end = current_char + len(word['text'])
@@ -1501,8 +1516,8 @@ def redact_image_pdf(file_path:str,
1501
  progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
1502
 
1503
  # If there's data from a previous run (passed in via the DataFrame parameters), add it
1504
- all_line_level_ocr_results_list = []
1505
- all_pages_decision_process_list = []
1506
 
1507
  if not all_page_line_level_ocr_results_df.empty:
1508
  all_line_level_ocr_results_list.extend(all_page_line_level_ocr_results_df.to_dict('records'))
@@ -1513,10 +1528,10 @@ def redact_image_pdf(file_path:str,
1513
  # Go through each page
1514
  for page_no in progress_bar:
1515
 
1516
- handwriting_or_signature_boxes = []
1517
- page_signature_recogniser_results = []
1518
- page_handwriting_recogniser_results = []
1519
- page_line_level_ocr_results_with_words = []
1520
  page_break_return = False
1521
  reported_page_number = str(page_no + 1)
1522
 
@@ -1567,7 +1582,7 @@ def redact_image_pdf(file_path:str,
1567
  )
1568
 
1569
  page_line_level_ocr_results_with_words = matching_page if matching_page else []
1570
- else: page_line_level_ocr_results_with_words = []
1571
 
1572
  if page_line_level_ocr_results_with_words:
1573
  print("Found OCR results for page in existing OCR with words object")
@@ -1581,7 +1596,7 @@ def redact_image_pdf(file_path:str,
1581
 
1582
  # Check if page exists in existing textract data. If not, send to service to analyse
1583
  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
1584
- text_blocks = []
1585
 
1586
  if not textract_data:
1587
  try:
@@ -1619,7 +1634,7 @@ def redact_image_pdf(file_path:str,
1619
  text_blocks, new_textract_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1620
 
1621
  # Check if "pages" key exists, if not, initialise it as an empty list
1622
- if "pages" not in textract_data: textract_data["pages"] = []
1623
 
1624
  # Append the new page data
1625
  textract_data["pages"].append(text_blocks)
@@ -1627,11 +1642,11 @@ def redact_image_pdf(file_path:str,
1627
  except Exception as e:
1628
  out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
1629
  print(out_message)
1630
- text_blocks = []
1631
  new_textract_request_metadata = "Failed Textract API call"
1632
 
1633
  # Check if "pages" key exists, if not, initialise it as an empty list
1634
- if "pages" not in textract_data: textract_data["pages"] = []
1635
 
1636
  raise Exception(out_message)
1637
 
@@ -1678,12 +1693,12 @@ def redact_image_pdf(file_path:str,
1678
 
1679
  comprehend_query_number = comprehend_query_number + comprehend_query_number_new
1680
 
1681
- else: page_redaction_bounding_boxes = []
1682
 
1683
  # Merge redaction bounding boxes that are close together
1684
  page_merged_redaction_bboxes = merge_img_bboxes(page_redaction_bounding_boxes, page_line_level_ocr_results_with_words['results'], page_signature_recogniser_results, page_handwriting_recogniser_results, handwrite_signature_checkbox)
1685
 
1686
- else: page_merged_redaction_bboxes = []
1687
 
1688
  # 3. Draw the merged boxes
1689
  ## Apply annotations to pdf with pymupdf
@@ -1710,7 +1725,7 @@ def redact_image_pdf(file_path:str,
1710
  fill = (0, 0, 0) # Fill colour for redactions
1711
  draw = ImageDraw.Draw(image)
1712
 
1713
- all_image_annotations_boxes = []
1714
 
1715
  for box in page_merged_redaction_bboxes:
1716
 
@@ -1914,9 +1929,9 @@ def create_line_level_ocr_results_from_characters(char_objects:List, line_number
1914
  Create OCRResult objects based on a list of pdfminer LTChar objects.
1915
  This version is corrected to use the specified OCRResult class definition.
1916
  """
1917
- line_level_results_out = []
1918
- line_level_characters_out = []
1919
- character_objects_out = []
1920
 
1921
  full_text = ""
1922
  # [x0, y0, x1, y1]
@@ -1943,7 +1958,7 @@ def create_line_level_ocr_results_from_characters(char_objects:List, line_number
1943
  line_level_characters_out.append(character_objects_out)
1944
 
1945
  # Reset for the next line
1946
- character_objects_out = []
1947
  full_text = ""
1948
  overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]
1949
  line_number += 1
@@ -2003,7 +2018,7 @@ def generate_words_for_line(line_chars: List) -> List[Dict[str, Any]]:
2003
  # The hyphen '-' is intentionally excluded to keep words like 'high-tech' together.
2004
  PUNCTUATION_TO_SPLIT = {'.', ',', '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'}
2005
 
2006
- line_words = []
2007
  current_word_text = ""
2008
  current_word_bbox = [float('inf'), float('inf'), -1, -1] # [x0, y0, x1, y1]
2009
  prev_char = None
@@ -2152,7 +2167,7 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
2152
  return decision_process_table
2153
 
2154
  def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
2155
- pikepdf_redaction_annotations_on_page = []
2156
  for analysed_bounding_box in analysed_bounding_boxes:
2157
 
2158
  bounding_box = analysed_bounding_box["boundingBox"]
@@ -2282,7 +2297,7 @@ def redact_text_pdf(
2282
 
2283
  #file_name = get_file_name_without_type(file_path)
2284
 
2285
- if not all_page_line_level_ocr_results_with_words: all_page_line_level_ocr_results_with_words = []
2286
 
2287
  # Check that page_min and page_max are within expected ranges
2288
  if page_max > number_of_pages or page_max == 0: page_max = number_of_pages
@@ -2315,20 +2330,20 @@ def redact_text_pdf(
2315
  # Go page by page
2316
  for page_layout in extract_pages(file_path, page_numbers = [page_no], maxpages=1):
2317
 
2318
- all_page_line_text_extraction_characters = []
2319
- all_page_line_level_text_extraction_results_list = []
2320
- page_analyser_results = []
2321
- page_redaction_bounding_boxes = []
2322
 
2323
- characters = []
2324
- pikepdf_redaction_annotations_on_page = []
2325
  page_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"])
2326
  page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height", "line"])
2327
- page_text_ocr_outputs_list = []
2328
 
2329
  text_line_no = 1
2330
  for n, text_container in enumerate(page_layout):
2331
- characters = []
2332
 
2333
  if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
2334
  characters = get_text_container_characters(text_container)
@@ -2390,7 +2405,7 @@ def redact_text_pdf(
2390
  # Annotate redactions on page
2391
  pikepdf_redaction_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_redaction_bounding_boxes)
2392
 
2393
- else: pikepdf_redaction_annotations_on_page = []
2394
 
2395
  # Make pymupdf page redactions
2396
  if redact_whole_page_list:
 
201
  pdf_file_name_with_ext = ""
202
  pdf_file_name_without_ext = ""
203
  page_break_return = False
204
+ blank_request_metadata = list()
205
  all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
206
  review_out_file_paths = [prepared_pdf_file_paths[0]]
207
 
 
387
  if not in_allow_list.empty:
388
  in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
389
  else:
390
+ in_allow_list_flat = list()
391
 
392
  # If string, assume file path
393
  if isinstance(custom_recogniser_word_list, str):
 
396
  if not custom_recogniser_word_list.empty:
397
  custom_recogniser_word_list_flat = custom_recogniser_word_list.iloc[:, 0].tolist()
398
  else:
399
+ custom_recogniser_word_list_flat = list()
400
 
401
  # Sort the strings in order from the longest string to the shortest
402
  custom_recogniser_word_list_flat = sorted(custom_recogniser_word_list_flat, key=len, reverse=True)
 
412
  print("Could not convert whole page redaction data to number list due to:", e)
413
  redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].tolist()
414
  else:
415
+ redact_whole_page_list_flat = list()
416
 
417
 
418
 
 
1100
  else:
1101
  page.set_cropbox(original_cropbox)
1102
 
1103
+ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]= list(), page_sizes_df:pd.DataFrame=pd.DataFrame()):
1104
 
1105
  rect_height = page.rect.height
1106
  rect_width = page.rect.width
 
1127
  image_dimensions = {}
1128
 
1129
  out_annotation_boxes = {}
1130
+ all_image_annotation_boxes = list()
1131
 
1132
  if isinstance(image, Image.Image):
1133
  image_path = move_page_info(str(page))
 
1238
  # IMAGE-BASED OCR PDF TEXT DETECTION/REDACTION WITH TESSERACT OR AWS TEXTRACT
1239
  ###
1240
 
1241
+ def merge_img_bboxes(bboxes: list, combined_results: Dict, page_signature_recogniser_results: list = list(), page_handwriting_recogniser_results: list = list(), handwrite_signature_checkbox: List[str] = ["Extract handwriting", "Extract signatures"], horizontal_threshold: int = 50, vertical_threshold: int = 12):
1242
+ """
1243
+ Merges bounding boxes for image annotations based on the provided results from signature and handwriting recognizers.
1244
+
1245
+ Args:
1246
+ bboxes (list): A list of bounding boxes to be merged.
1247
+ combined_results (Dict): A dictionary containing combined results with line text and their corresponding bounding boxes.
1248
+ page_signature_recogniser_results (list, optional): A list of results from the signature recognizer. Defaults to an empty list.
1249
+ page_handwriting_recogniser_results (list, optional): A list of results from the handwriting recognizer. Defaults to an empty list.
1250
+ handwrite_signature_checkbox (List[str], optional): A list of options indicating whether to extract handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
1251
+ horizontal_threshold (int, optional): The threshold for merging bounding boxes horizontally. Defaults to 50.
1252
+ vertical_threshold (int, optional): The threshold for merging bounding boxes vertically. Defaults to 12.
1253
+
1254
+ Returns:
1255
+ None: This function modifies the bounding boxes in place and does not return a value.
1256
+ """
1257
 
1258
+ all_bboxes = list()
1259
+ merged_bboxes = list()
1260
  grouped_bboxes = defaultdict(list)
1261
 
1262
  # Deep copy original bounding boxes to retain them
 
1271
  merged_bboxes.extend(copy.deepcopy(page_signature_recogniser_results))
1272
 
1273
  # Reconstruct bounding boxes for substrings of interest
1274
+ reconstructed_bboxes = list()
1275
  for bbox in bboxes:
1276
  bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
1277
  for line_text, line_info in combined_results.items():
 
1281
  start_char = line_text.index(bbox.text)
1282
  end_char = start_char + len(bbox.text)
1283
 
1284
+ relevant_words = list()
1285
  current_char = 0
1286
  for word in line_info['words']:
1287
  word_end = current_char + len(word['text'])
 
1516
  progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
1517
 
1518
  # If there's data from a previous run (passed in via the DataFrame parameters), add it
1519
+ all_line_level_ocr_results_list = list()
1520
+ all_pages_decision_process_list = list()
1521
 
1522
  if not all_page_line_level_ocr_results_df.empty:
1523
  all_line_level_ocr_results_list.extend(all_page_line_level_ocr_results_df.to_dict('records'))
 
1528
  # Go through each page
1529
  for page_no in progress_bar:
1530
 
1531
+ handwriting_or_signature_boxes = list()
1532
+ page_signature_recogniser_results = list()
1533
+ page_handwriting_recogniser_results = list()
1534
+ page_line_level_ocr_results_with_words = list()
1535
  page_break_return = False
1536
  reported_page_number = str(page_no + 1)
1537
 
 
1582
  )
1583
 
1584
  page_line_level_ocr_results_with_words = matching_page if matching_page else []
1585
+ else: page_line_level_ocr_results_with_words = list()
1586
 
1587
  if page_line_level_ocr_results_with_words:
1588
  print("Found OCR results for page in existing OCR with words object")
 
1596
 
1597
  # Check if page exists in existing textract data. If not, send to service to analyse
1598
  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
1599
+ text_blocks = list()
1600
 
1601
  if not textract_data:
1602
  try:
 
1634
  text_blocks, new_textract_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1635
 
1636
  # Check if "pages" key exists, if not, initialise it as an empty list
1637
+ if "pages" not in textract_data: textract_data["pages"] = list()
1638
 
1639
  # Append the new page data
1640
  textract_data["pages"].append(text_blocks)
 
1642
  except Exception as e:
1643
  out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
1644
  print(out_message)
1645
+ text_blocks = list()
1646
  new_textract_request_metadata = "Failed Textract API call"
1647
 
1648
  # Check if "pages" key exists, if not, initialise it as an empty list
1649
+ if "pages" not in textract_data: textract_data["pages"] = list()
1650
 
1651
  raise Exception(out_message)
1652
 
 
1693
 
1694
  comprehend_query_number = comprehend_query_number + comprehend_query_number_new
1695
 
1696
+ else: page_redaction_bounding_boxes = list()
1697
 
1698
  # Merge redaction bounding boxes that are close together
1699
  page_merged_redaction_bboxes = merge_img_bboxes(page_redaction_bounding_boxes, page_line_level_ocr_results_with_words['results'], page_signature_recogniser_results, page_handwriting_recogniser_results, handwrite_signature_checkbox)
1700
 
1701
+ else: page_merged_redaction_bboxes = list()
1702
 
1703
  # 3. Draw the merged boxes
1704
  ## Apply annotations to pdf with pymupdf
 
1725
  fill = (0, 0, 0) # Fill colour for redactions
1726
  draw = ImageDraw.Draw(image)
1727
 
1728
+ all_image_annotations_boxes = list()
1729
 
1730
  for box in page_merged_redaction_bboxes:
1731
 
 
1929
  Create OCRResult objects based on a list of pdfminer LTChar objects.
1930
  This version is corrected to use the specified OCRResult class definition.
1931
  """
1932
+ line_level_results_out = list()
1933
+ line_level_characters_out = list()
1934
+ character_objects_out = list()
1935
 
1936
  full_text = ""
1937
  # [x0, y0, x1, y1]
 
1958
  line_level_characters_out.append(character_objects_out)
1959
 
1960
  # Reset for the next line
1961
+ character_objects_out = list()
1962
  full_text = ""
1963
  overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]
1964
  line_number += 1
 
2018
  # The hyphen '-' is intentionally excluded to keep words like 'high-tech' together.
2019
  PUNCTUATION_TO_SPLIT = {'.', ',', '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'}
2020
 
2021
+ line_words = list()
2022
  current_word_text = ""
2023
  current_word_bbox = [float('inf'), float('inf'), -1, -1] # [x0, y0, x1, y1]
2024
  prev_char = None
 
2167
  return decision_process_table
2168
 
2169
  def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
2170
+ pikepdf_redaction_annotations_on_page = list()
2171
  for analysed_bounding_box in analysed_bounding_boxes:
2172
 
2173
  bounding_box = analysed_bounding_box["boundingBox"]
 
2297
 
2298
  #file_name = get_file_name_without_type(file_path)
2299
 
2300
+ if not all_page_line_level_ocr_results_with_words: all_page_line_level_ocr_results_with_words = list()
2301
 
2302
  # Check that page_min and page_max are within expected ranges
2303
  if page_max > number_of_pages or page_max == 0: page_max = number_of_pages
 
2330
  # Go page by page
2331
  for page_layout in extract_pages(file_path, page_numbers = [page_no], maxpages=1):
2332
 
2333
+ all_page_line_text_extraction_characters = list()
2334
+ all_page_line_level_text_extraction_results_list = list()
2335
+ page_analyser_results = list()
2336
+ page_redaction_bounding_boxes = list()
2337
 
2338
+ characters = list()
2339
+ pikepdf_redaction_annotations_on_page = list()
2340
  page_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"])
2341
  page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height", "line"])
2342
+ page_text_ocr_outputs_list = list()
2343
 
2344
  text_line_no = 1
2345
  for n, text_container in enumerate(page_layout):
2346
+ characters = list()
2347
 
2348
  if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
2349
  characters = get_text_container_characters(text_container)
 
2405
  # Annotate redactions on page
2406
  pikepdf_redaction_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_redaction_bounding_boxes)
2407
 
2408
+ else: pikepdf_redaction_annotations_on_page = list()
2409
 
2410
  # Make pymupdf page redactions
2411
  if redact_whole_page_list:
tools/redaction_review.py CHANGED
@@ -99,8 +99,8 @@ def get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object:
99
  recogniser_dropdown_value:str,
100
  text_dropdown_value:str,
101
  page_dropdown_value:str,
102
- review_df:pd.DataFrame=[],
103
- page_sizes:List[str]=[]):
104
  '''
105
  Create a filtered recogniser dataframe and associated dropdowns based on current information in the image annotator and review data frame.
106
  '''
@@ -147,7 +147,7 @@ def get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object:
147
 
148
  return recogniser_dataframe_out_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop
149
 
150
- def update_recogniser_dataframes(page_image_annotator_object:AnnotatedImageData, recogniser_dataframe_base:pd.DataFrame, recogniser_entities_dropdown_value:str="ALL", text_dropdown_value:str="ALL", page_dropdown_value:str="ALL", review_df:pd.DataFrame=[], page_sizes:list[str]=[]):
151
  '''
152
  Update recogniser dataframe information that appears alongside the pdf pages on the review screen.
153
  '''
@@ -265,7 +265,7 @@ def update_annotator_page_from_review_df(
265
  if not current_page_review_df.empty:
266
  # Convert the current page's review data to annotation list format for *this page*
267
 
268
- current_page_annotations_list = []
269
  # Define expected annotation dict keys, including 'image', 'page', coords, 'label', 'text', 'color' etc.
270
  # Assuming review_df has compatible columns
271
  expected_annotation_keys = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id'] # Add/remove as needed
@@ -340,7 +340,7 @@ def update_annotator_page_from_review_df(
340
  if not page_sizes_df.empty:
341
  page_sizes = page_sizes_df.to_dict(orient='records')
342
  else:
343
- page_sizes = [] # Ensure page_sizes is a list if df is empty
344
 
345
  # --- Re-evaluate Coordinate Multiplication and Duplicate Removal ---
346
  # Let's assume remove_duplicate_images_with_blank_boxes expects the raw list of dicts state format:
@@ -609,7 +609,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
609
  merged_df = merged_df.sort_values('image')
610
 
611
 
612
- final_annotations_list = []
613
  box_cols = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
614
 
615
  # Now, when we group, we use `sort=False`. This tells groupby to respect the
@@ -622,7 +622,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
622
  # Check if the group has actual annotations. iloc[0] is safe because even pages
623
  # without annotations will have one row with NaN values from the merge.
624
  if pd.isna(group.iloc[0].get('id')):
625
- boxes = []
626
  else:
627
  valid_box_cols = [col for col in box_cols if col in group.columns]
628
  # We should also sort the boxes within a page for consistency (e.g., left-to-right)
@@ -751,7 +751,7 @@ def update_annotator_object_and_filter_df(
751
  recogniser_dataframe_base:pd.DataFrame=None, # Simplified default
752
  zoom:int=100,
753
  review_df:pd.DataFrame=None, # Use None for default empty DataFrame
754
- page_sizes:List[dict]=[],
755
  doc_full_file_name_textbox:str='',
756
  input_folder:str=INPUT_FOLDER
757
  ) -> Tuple[image_annotator, gr.Number, gr.Number, int, str, gr.Dataframe, pd.DataFrame, List[str], List[str], List[dict], List[AnnotatedImageData]]:
@@ -775,7 +775,7 @@ def update_annotator_object_and_filter_df(
775
  # Return blank/default outputs
776
 
777
  blank_annotator = image_annotator(
778
- value = None, boxes_alpha=0.1, box_thickness=1, label_list=[], label_colors=[],
779
  show_label=False, height=zoom_str, width=zoom_str, box_min_size=1,
780
  box_selected_thickness=2, handle_size=4, sources=None,
781
  show_clear_button=False, show_share_button=False, show_remove_button=False,
@@ -851,7 +851,7 @@ def update_annotator_object_and_filter_df(
851
  if not page_sizes_df.empty:
852
  page_sizes = page_sizes_df.to_dict(orient='records')
853
  else:
854
- page_sizes = [] # Ensure page_sizes is a list if df is empty
855
 
856
  # --- OPTIMIZATION: Prepare data *only* for the current page for display ---
857
  current_page_image_annotator_object = None
@@ -907,12 +907,12 @@ def update_annotator_object_and_filter_df(
907
 
908
  except Exception as e:
909
  print(f"Error calling update_recogniser_dataframes: {e}. Returning empty/default filter data.")
910
- recogniser_entities_list = []
911
- recogniser_colour_list = []
912
  recogniser_dataframe_out_gr = gr.Dataframe(pd.DataFrame(columns=["page", "label", "text", "id"]))
913
  recogniser_dataframe_modified = pd.DataFrame(columns=["page", "label", "text", "id"])
914
- text_entities_drop = []
915
- page_entities_drop = []
916
 
917
 
918
  # --- Final Output Components ---
@@ -946,7 +946,7 @@ def update_annotator_object_and_filter_df(
946
  interactive=True # Keep interactive if data is present
947
  )
948
 
949
- page_entities_drop_redaction_list = []
950
  all_pages_in_doc_list = [str(i) for i in range(1, len(page_sizes) + 1)]
951
  page_entities_drop_redaction_list.extend(all_pages_in_doc_list)
952
 
@@ -970,7 +970,7 @@ def update_all_page_annotation_object_based_on_previous_page(
970
  current_page:int,
971
  previous_page:int,
972
  all_image_annotations:List[AnnotatedImageData],
973
- page_sizes:List[dict]=[],
974
  clear_all:bool=False
975
  ):
976
  '''
@@ -991,7 +991,7 @@ def update_all_page_annotation_object_based_on_previous_page(
991
  page_image_annotator_object, all_image_annotations = replace_annotator_object_img_np_array_with_page_sizes_image_path(all_image_annotations, page_image_annotator_object, page_sizes, previous_page)
992
 
993
  if clear_all == False: all_image_annotations[previous_page_zero_index] = page_image_annotator_object
994
- else: all_image_annotations[previous_page_zero_index]["boxes"] = []
995
 
996
  return all_image_annotations, current_page, current_page
997
 
@@ -1003,16 +1003,16 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
1003
  review_file_state:pd.DataFrame,
1004
  output_folder:str = OUTPUT_FOLDER,
1005
  save_pdf:bool=True,
1006
- page_sizes:List[dict]=[],
1007
  COMPRESS_REDACTED_PDF:bool=COMPRESS_REDACTED_PDF,
1008
  progress=gr.Progress(track_tqdm=True)):
1009
  '''
1010
  Apply modified redactions to a pymupdf and export review files.
1011
  '''
1012
 
1013
- output_files = []
1014
- output_log_files = []
1015
- pdf_doc = []
1016
  review_df = review_file_state
1017
 
1018
  page_image_annotator_object = all_image_annotations[current_page - 1]
@@ -1078,7 +1078,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
1078
  doc = [image]
1079
 
1080
  elif file_extension in '.csv':
1081
- pdf_doc = []
1082
 
1083
  # If working with pdfs
1084
  elif is_pdf(file_path) == True:
@@ -1088,7 +1088,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
1088
  output_files.append(orig_pdf_file_path)
1089
 
1090
  number_of_pages = pdf_doc.page_count
1091
- original_cropboxes = []
1092
 
1093
  page_sizes_df = pd.DataFrame(page_sizes)
1094
  page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
@@ -1619,7 +1619,7 @@ def convert_pymupdf_coords_to_adobe(x1: float, y1: float, x2: float, y2: float,
1619
 
1620
  return x1, adobe_y1, x2, adobe_y2
1621
 
1622
- def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:List[str]=[], document_cropboxes:List=[], page_sizes:List[dict]=[]):
1623
  '''
1624
  Create an xfdf file from a review csv file and a pdf
1625
  '''
@@ -1711,11 +1711,11 @@ def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, i
1711
  reparsed = minidom.parseString(rough_string)
1712
  return reparsed.toxml() #.toprettyxml(indent=" ")
1713
 
1714
- def convert_df_to_xfdf(input_files:List[str], pdf_doc:Document, image_paths:List[str], output_folder:str = OUTPUT_FOLDER, document_cropboxes:List=[], page_sizes:List[dict]=[]):
1715
  '''
1716
  Load in files to convert a review file into an Adobe comment file format
1717
  '''
1718
- output_paths = []
1719
  pdf_name = ""
1720
  file_path_name = ""
1721
 
@@ -1814,7 +1814,7 @@ def parse_xfdf(xfdf_path:str):
1814
  # Define the namespace
1815
  namespace = {'xfdf': 'http://ns.adobe.com/xfdf/'}
1816
 
1817
- redactions = []
1818
 
1819
  # Find all redact elements using the namespace
1820
  for redact in root.findall('.//xfdf:redact', namespaces=namespace):
@@ -1846,8 +1846,8 @@ def convert_xfdf_to_dataframe(file_paths_list:List[str], pymupdf_doc, image_path
1846
  Returns:
1847
  - DataFrame containing redaction information
1848
  '''
1849
- output_paths = []
1850
- xfdf_paths = []
1851
  df = pd.DataFrame()
1852
 
1853
  # Sort the file paths so that the pdfs come first
 
99
  recogniser_dropdown_value:str,
100
  text_dropdown_value:str,
101
  page_dropdown_value:str,
102
+ review_df:pd.DataFrame=list(),
103
+ page_sizes:List[str]=list()):
104
  '''
105
  Create a filtered recogniser dataframe and associated dropdowns based on current information in the image annotator and review data frame.
106
  '''
 
147
 
148
  return recogniser_dataframe_out_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop
149
 
150
+ def update_recogniser_dataframes(page_image_annotator_object:AnnotatedImageData, recogniser_dataframe_base:pd.DataFrame, recogniser_entities_dropdown_value:str="ALL", text_dropdown_value:str="ALL", page_dropdown_value:str="ALL", review_df:pd.DataFrame=list(), page_sizes:list[str]=list()):
151
  '''
152
  Update recogniser dataframe information that appears alongside the pdf pages on the review screen.
153
  '''
 
265
  if not current_page_review_df.empty:
266
  # Convert the current page's review data to annotation list format for *this page*
267
 
268
+ current_page_annotations_list = list()
269
  # Define expected annotation dict keys, including 'image', 'page', coords, 'label', 'text', 'color' etc.
270
  # Assuming review_df has compatible columns
271
  expected_annotation_keys = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id'] # Add/remove as needed
 
340
  if not page_sizes_df.empty:
341
  page_sizes = page_sizes_df.to_dict(orient='records')
342
  else:
343
+ page_sizes = list() # Ensure page_sizes is a list if df is empty
344
 
345
  # --- Re-evaluate Coordinate Multiplication and Duplicate Removal ---
346
  # Let's assume remove_duplicate_images_with_blank_boxes expects the raw list of dicts state format:
 
609
  merged_df = merged_df.sort_values('image')
610
 
611
 
612
+ final_annotations_list = list()
613
  box_cols = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
614
 
615
  # Now, when we group, we use `sort=False`. This tells groupby to respect the
 
622
  # Check if the group has actual annotations. iloc[0] is safe because even pages
623
  # without annotations will have one row with NaN values from the merge.
624
  if pd.isna(group.iloc[0].get('id')):
625
+ boxes = list()
626
  else:
627
  valid_box_cols = [col for col in box_cols if col in group.columns]
628
  # We should also sort the boxes within a page for consistency (e.g., left-to-right)
 
751
  recogniser_dataframe_base:pd.DataFrame=None, # Simplified default
752
  zoom:int=100,
753
  review_df:pd.DataFrame=None, # Use None for default empty DataFrame
754
+ page_sizes:List[dict]=list(),
755
  doc_full_file_name_textbox:str='',
756
  input_folder:str=INPUT_FOLDER
757
  ) -> Tuple[image_annotator, gr.Number, gr.Number, int, str, gr.Dataframe, pd.DataFrame, List[str], List[str], List[dict], List[AnnotatedImageData]]:
 
775
  # Return blank/default outputs
776
 
777
  blank_annotator = image_annotator(
778
+ value = None, boxes_alpha=0.1, box_thickness=1, label_list=list(), label_colors=list(),
779
  show_label=False, height=zoom_str, width=zoom_str, box_min_size=1,
780
  box_selected_thickness=2, handle_size=4, sources=None,
781
  show_clear_button=False, show_share_button=False, show_remove_button=False,
 
851
  if not page_sizes_df.empty:
852
  page_sizes = page_sizes_df.to_dict(orient='records')
853
  else:
854
+ page_sizes = list() # Ensure page_sizes is a list if df is empty
855
 
856
  # --- OPTIMIZATION: Prepare data *only* for the current page for display ---
857
  current_page_image_annotator_object = None
 
907
 
908
  except Exception as e:
909
  print(f"Error calling update_recogniser_dataframes: {e}. Returning empty/default filter data.")
910
+ recogniser_entities_list = list()
911
+ recogniser_colour_list = list()
912
  recogniser_dataframe_out_gr = gr.Dataframe(pd.DataFrame(columns=["page", "label", "text", "id"]))
913
  recogniser_dataframe_modified = pd.DataFrame(columns=["page", "label", "text", "id"])
914
+ text_entities_drop = list()
915
+ page_entities_drop = list()
916
 
917
 
918
  # --- Final Output Components ---
 
946
  interactive=True # Keep interactive if data is present
947
  )
948
 
949
+ page_entities_drop_redaction_list = list()
950
  all_pages_in_doc_list = [str(i) for i in range(1, len(page_sizes) + 1)]
951
  page_entities_drop_redaction_list.extend(all_pages_in_doc_list)
952
 
 
970
  current_page:int,
971
  previous_page:int,
972
  all_image_annotations:List[AnnotatedImageData],
973
+ page_sizes:List[dict]=list(),
974
  clear_all:bool=False
975
  ):
976
  '''
 
991
  page_image_annotator_object, all_image_annotations = replace_annotator_object_img_np_array_with_page_sizes_image_path(all_image_annotations, page_image_annotator_object, page_sizes, previous_page)
992
 
993
  if clear_all == False: all_image_annotations[previous_page_zero_index] = page_image_annotator_object
994
+ else: all_image_annotations[previous_page_zero_index]["boxes"] = list()
995
 
996
  return all_image_annotations, current_page, current_page
997
 
 
1003
  review_file_state:pd.DataFrame,
1004
  output_folder:str = OUTPUT_FOLDER,
1005
  save_pdf:bool=True,
1006
+ page_sizes:List[dict]=list(),
1007
  COMPRESS_REDACTED_PDF:bool=COMPRESS_REDACTED_PDF,
1008
  progress=gr.Progress(track_tqdm=True)):
1009
  '''
1010
  Apply modified redactions to a pymupdf and export review files.
1011
  '''
1012
 
1013
+ output_files = list()
1014
+ output_log_files = list()
1015
+ pdf_doc = list()
1016
  review_df = review_file_state
1017
 
1018
  page_image_annotator_object = all_image_annotations[current_page - 1]
 
1078
  doc = [image]
1079
 
1080
  elif file_extension in '.csv':
1081
+ pdf_doc = list()
1082
 
1083
  # If working with pdfs
1084
  elif is_pdf(file_path) == True:
 
1088
  output_files.append(orig_pdf_file_path)
1089
 
1090
  number_of_pages = pdf_doc.page_count
1091
+ original_cropboxes = list()
1092
 
1093
  page_sizes_df = pd.DataFrame(page_sizes)
1094
  page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
 
1619
 
1620
  return x1, adobe_y1, x2, adobe_y2
1621
 
1622
+ def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:List[str]=list(), document_cropboxes:List=list(), page_sizes:List[dict]=list()):
1623
  '''
1624
  Create an xfdf file from a review csv file and a pdf
1625
  '''
 
1711
  reparsed = minidom.parseString(rough_string)
1712
  return reparsed.toxml() #.toprettyxml(indent=" ")
1713
 
1714
+ def convert_df_to_xfdf(input_files:List[str], pdf_doc:Document, image_paths:List[str], output_folder:str = OUTPUT_FOLDER, document_cropboxes:List=list(), page_sizes:List[dict]=list()):
1715
  '''
1716
  Load in files to convert a review file into an Adobe comment file format
1717
  '''
1718
+ output_paths = list()
1719
  pdf_name = ""
1720
  file_path_name = ""
1721
 
 
1814
  # Define the namespace
1815
  namespace = {'xfdf': 'http://ns.adobe.com/xfdf/'}
1816
 
1817
+ redactions = list()
1818
 
1819
  # Find all redact elements using the namespace
1820
  for redact in root.findall('.//xfdf:redact', namespaces=namespace):
 
1846
  Returns:
1847
  - DataFrame containing redaction information
1848
  '''
1849
+ output_paths = list()
1850
+ xfdf_paths = list()
1851
  df = pd.DataFrame()
1852
 
1853
  # Sort the file paths so that the pdfs come first