seanpedrickcase committed on
Commit
bc4bdbd
·
1 Parent(s): bbf818d

Can now select only specific pages in a document to redact. Image-based redaction should work correctly now.

Browse files
Files changed (4) hide show
  1. app.py +30 -8
  2. requirements.txt +3 -3
  3. tools/file_conversion.py +9 -4
  4. tools/file_redaction.py +112 -52
app.py CHANGED
@@ -27,6 +27,19 @@ language = 'en'
27
  feedback_data_folder = 'feedback/' + today_rev + '/'
28
  logs_data_folder = 'logs/' + today_rev + '/'
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  # Create the gradio interface
31
  app = gr.Blocks(theme = gr.themes.Base())
32
 
@@ -42,16 +55,20 @@ with app:
42
 
43
  session_hash_state = gr.State()
44
  s3_output_folder_state = gr.State()
 
 
 
45
  feedback_logs_state = gr.State(feedback_data_folder + 'log.csv')
46
  feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
47
  usage_logs_state = gr.State(logs_data_folder + 'log.csv')
48
  usage_s3_logs_loc_state = gr.State(logs_data_folder)
 
49
 
50
  gr.Markdown(
51
  """
52
  # Document redaction
53
 
54
- Redact personal information from documents, open text, or xlsx/csv tabular data. See the 'Redaction settings' to change various settings such as which types of information to redact (e.g. people, places), or terms to exclude from redaction.
55
 
56
  WARNING: In testing the app seems to only find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
57
 
@@ -115,6 +132,9 @@ with app:
115
  """)
116
  with gr.Accordion("Settings for documents", open = True):
117
  in_redaction_method = gr.Radio(label="Default document redaction method - text analysis is faster is not useful for image-based PDFs. Imaged-based is slightly less accurate in general.", value = "Text analysis", choices=["Text analysis", "Image analysis"])
 
 
 
118
  with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
119
  anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
120
 
@@ -143,12 +163,12 @@ with app:
143
 
144
  # Document redaction
145
  redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
146
- then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state],
147
  outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state], api_name="redact_doc")
148
 
149
  # If the output file count text box changes, keep going with redacting each document until done
150
  text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
151
- then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state],
152
  outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state]).\
153
  then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
154
 
@@ -162,9 +182,11 @@ with app:
162
  then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
163
 
164
  #app.load(wipe_logs, inputs=[feedback_logs_state, usage_logs_state], outputs=[]).\
165
- # then(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
166
-
167
- app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
 
 
168
 
169
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
170
  callback = gr.CSVLogger()
@@ -190,6 +212,6 @@ print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
190
 
191
  if __name__ == "__main__":
192
  if os.environ['COGNITO_AUTH'] == "1":
193
- app.queue().launch(show_error=True, auth=authenticate_user, max_file_size='10mb')
194
  else:
195
- app.queue().launch(show_error=True, inbrowser=True, max_file_size='10mb')
 
27
  feedback_data_folder = 'feedback/' + today_rev + '/'
28
  logs_data_folder = 'logs/' + today_rev + '/'
29
 
30
+ def create_logs_folder(session_hash_textbox):
31
+ print("session_hash_textbox", session_hash_textbox)
32
+
33
+ feedback_data_folder = 'feedback/' + session_hash_textbox + "/" + today_rev + '/'
34
+ logs_data_folder = 'logs/' + session_hash_textbox + "/" + today_rev + '/'
35
+
36
+ feedback_logs_state = gr.State(feedback_data_folder + 'log.csv')
37
+ feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
38
+ usage_logs_state = gr.State(logs_data_folder + 'log.csv')
39
+ usage_s3_logs_loc_state = gr.State(logs_data_folder)
40
+
41
+ return feedback_logs_state, feedback_s3_logs_loc_state, usage_logs_state, usage_s3_logs_loc_state
42
+
43
  # Create the gradio interface
44
  app = gr.Blocks(theme = gr.themes.Base())
45
 
 
55
 
56
  session_hash_state = gr.State()
57
  s3_output_folder_state = gr.State()
58
+
59
+
60
+
61
  feedback_logs_state = gr.State(feedback_data_folder + 'log.csv')
62
  feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
63
  usage_logs_state = gr.State(logs_data_folder + 'log.csv')
64
  usage_s3_logs_loc_state = gr.State(logs_data_folder)
65
+
66
 
67
  gr.Markdown(
68
  """
69
  # Document redaction
70
 
71
+ Redact personal information from documents, open text, or xlsx/csv tabular data. See the 'Redaction settings' to change various settings such as which types of information to redact (e.g. people, places), or terms to exclude from redaction. If you are getting 0 redactions, it's possible that the text in the document is saved in image format instead of as selectable text. Select 'Image analysis' on the Settings page in this case.
72
 
73
  WARNING: In testing the app seems to only find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
74
 
 
132
  """)
133
  with gr.Accordion("Settings for documents", open = True):
134
  in_redaction_method = gr.Radio(label="Default document redaction method - text analysis is faster is not useful for image-based PDFs. Imaged-based is slightly less accurate in general.", value = "Text analysis", choices=["Text analysis", "Image analysis"])
135
+ with gr.Row():
136
+ page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
137
+ page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
138
  with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
139
  anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
140
 
 
163
 
164
  # Document redaction
165
  redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
166
+ then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max],
167
  outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state], api_name="redact_doc")
168
 
169
  # If the output file count text box changes, keep going with redacting each document until done
170
  text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
171
+ then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max],
172
  outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state]).\
173
  then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
174
 
 
182
  then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
183
 
184
  #app.load(wipe_logs, inputs=[feedback_logs_state, usage_logs_state], outputs=[]).\
185
+ # then(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
186
+
187
+
188
+ app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])#.\
189
+ #then(create_logs_folder, inputs=[session_hash_textbox], outputs = [feedback_logs_state, feedback_s3_logs_loc_state, usage_logs_state, usage_s3_logs_loc_state])
190
 
191
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
192
  callback = gr.CSVLogger()
 
212
 
213
  if __name__ == "__main__":
214
  if os.environ['COGNITO_AUTH'] == "1":
215
+ app.queue().launch(show_error=True, auth=authenticate_user, max_file_size='50mb')
216
  else:
217
+ app.queue().launch(show_error=True, inbrowser=True, max_file_size='50mb')
requirements.txt CHANGED
@@ -1,9 +1,9 @@
1
  pdfminer.six==20231228
2
  pdf2image==1.17.0
3
  opencv-python==4.9.0.80
4
- presidio_analyzer==2.2.354
5
- presidio_anonymizer==2.2.354
6
- presidio-image-redactor==0.0.52
7
  pikepdf==8.15.1
8
  pandas==2.2.2
9
  spacy==3.7.5
 
1
  pdfminer.six==20231228
2
  pdf2image==1.17.0
3
  opencv-python==4.9.0.80
4
+ presidio_analyzer==2.2.355
5
+ presidio_anonymizer==2.2.355
6
+ presidio-image-redactor==0.0.53
7
  pikepdf==8.15.1
8
  pandas==2.2.2
9
  spacy==3.7.5
tools/file_conversion.py CHANGED
@@ -36,7 +36,7 @@ def is_pdf(filename):
36
  # %%
37
  ## Convert pdf to image if necessary
38
 
39
- def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
40
 
41
  # Get the number of pages in the PDF
42
  page_count = pdfinfo_from_path(pdf_path)['Pages']
@@ -46,21 +46,26 @@ def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
46
 
47
  # Open the PDF file
48
  #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
49
- for page_num in range(0,page_count): #progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
50
 
51
  # print("Current page: ", str(page_num + 1))
52
 
53
  # Convert one page to image
54
  image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300, use_cropbox=True, use_pdftocairo=False)
55
 
 
56
  # If no images are returned, break the loop
57
  if not image:
58
  print("Conversion of page", str(page_num), "to file failed.")
59
  break
60
 
 
 
 
61
  images.extend(image)
62
 
63
  print("PDF has been converted to images.")
 
64
 
65
  return images
66
 
@@ -146,7 +151,7 @@ def prepare_image_or_text_pdf(
146
  #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
147
 
148
  file_paths_loop = [file_paths[int(latest_file_completed)]]
149
- print("file_paths_loop:", str(file_paths_loop))
150
 
151
  #for file in progress.tqdm(file_paths, desc="Preparing files"):
152
  for file in file_paths_loop:
@@ -169,7 +174,7 @@ def prepare_image_or_text_pdf(
169
  return out_message, out_file_paths
170
 
171
  out_file_path = process_file(file_path)
172
- print("Out file path at image conversion step:", out_file_path)
173
 
174
  elif in_redact_method == "Text analysis":
175
  if is_pdf(file_path) == False:
 
36
  # %%
37
  ## Convert pdf to image if necessary
38
 
39
+ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(track_tqdm=True)):
40
 
41
  # Get the number of pages in the PDF
42
  page_count = pdfinfo_from_path(pdf_path)['Pages']
 
46
 
47
  # Open the PDF file
48
  #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
49
+ for page_num in range(page_min,page_count): #progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
50
 
51
  # print("Current page: ", str(page_num + 1))
52
 
53
  # Convert one page to image
54
  image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300, use_cropbox=True, use_pdftocairo=False)
55
 
56
+
57
  # If no images are returned, break the loop
58
  if not image:
59
  print("Conversion of page", str(page_num), "to file failed.")
60
  break
61
 
62
+ # print("Conversion of page", str(page_num), "to file succeeded.")
63
+ # print("image:", image)
64
+
65
  images.extend(image)
66
 
67
  print("PDF has been converted to images.")
68
+ # print("Images:", images)
69
 
70
  return images
71
 
 
151
  #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
152
 
153
  file_paths_loop = [file_paths[int(latest_file_completed)]]
154
+ #print("file_paths_loop:", str(file_paths_loop))
155
 
156
  #for file in progress.tqdm(file_paths, desc="Preparing files"):
157
  for file in file_paths_loop:
 
174
  return out_message, out_file_paths
175
 
176
  out_file_path = process_file(file_path)
177
+ #print("Out file path at image conversion step:", out_file_path)
178
 
179
  elif in_redact_method == "Text analysis":
180
  if is_pdf(file_path) == False:
tools/file_redaction.py CHANGED
@@ -18,7 +18,7 @@ from tools.data_anonymise import generate_decision_process_output
18
  import gradio as gr
19
 
20
 
21
- def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, progress=gr.Progress(track_tqdm=True)):
22
 
23
  tic = time.perf_counter()
24
 
@@ -73,7 +73,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
73
  # return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
74
 
75
  print("Redacting file as image-based file")
76
- pdf_images, output_logs = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf)
77
  out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
78
  pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
79
 
@@ -97,7 +97,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
97
 
98
  # Analyse text-based pdf
99
  print('Redacting file as text-based PDF')
100
- pdf_text, output_logs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
101
  out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
102
  pdf_text.save(out_text_file_path)
103
 
@@ -175,12 +175,13 @@ def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
175
  merged_bboxes.append(merged_box)
176
  return merged_bboxes
177
 
178
- def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, progress=Progress(track_tqdm=True)):
179
  '''
180
  Take an path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
181
  '''
182
 
183
  fill = (0, 0, 0)
 
184
 
185
  if not image_paths:
186
  out_message = "PDF does not exist as images. Converting pages to image"
@@ -190,59 +191,101 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
190
  image_paths = process_file(file_path)
191
 
192
  images = []
193
- number_of_pages = len(image_paths)
 
 
 
 
194
 
195
  out_message = "Redacting pages"
196
  print(out_message)
197
  #progress(0.1, desc=out_message)
198
 
199
- #for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
200
- for i in range(0, number_of_pages):
201
-
202
- print("Redacting page", str(i + 1))
203
-
204
- # Get the image to redact using PIL lib (pillow)
205
- #print("image_paths:", image_paths)
206
 
207
- image = ImageChops.duplicate(image_paths[i])
 
 
 
208
 
209
- # %%
210
- image_analyser = ImageAnalyzerEngine(nlp_analyser)
211
- engine = ImageRedactorEngine(image_analyser)
212
 
213
- if language == 'en':
214
- ocr_lang = 'eng'
215
- else: ocr_lang = language
216
-
217
- bboxes = image_analyser.analyze(image,ocr_kwargs={"lang": ocr_lang},
218
- **{
219
- "allow_list": allow_list,
220
- "language": language,
221
- "entities": chosen_redact_entities,
222
- "score_threshold": score_threshold,
223
- "return_decision_process":True,
224
- })
225
-
226
- # Text placeholder in this processing step, as the analyze method does not return the OCR text
227
- if bboxes:
228
- decision_process_output_str = str(bboxes)
229
- print("Decision process:", decision_process_output_str)
230
-
231
- #print("For page: ", str(i), "Bounding boxes: ", bboxes)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
 
233
- draw = ImageDraw.Draw(image)
234
-
235
- merged_bboxes = merge_img_bboxes(bboxes)
236
 
237
- #print("For page:", str(i), "Merged bounding boxes:", merged_bboxes)
238
 
239
- # 3. Draw the merged boxes (unchanged)
240
- for box in merged_bboxes:
241
- x0 = box.left
242
- y0 = box.top
243
- x1 = x0 + box.width
244
- y1 = y0 + box.height
245
- draw.rectangle([x0, y0, x1, y1], fill=fill)
246
 
247
  images.append(image)
248
 
@@ -358,7 +401,7 @@ def create_annotations_for_bounding_boxes(analyzed_bounding_boxes):
358
  annotations_on_page.append(annotation)
359
  return annotations_on_page
360
 
361
- def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
362
  '''
363
  Redact chosen entities from a pdf that is made up of multiple pages that are not images.
364
  '''
@@ -370,13 +413,30 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
370
  pdf = Pdf.open(filename)
371
  page_num = 0
372
 
373
- for page in pdf.pages:
374
- print("Page number is:", page_num + 1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375
 
376
  annotations_on_page = []
377
  decision_process_table_on_page = []
378
 
379
- for page_layout in extract_pages(filename, page_numbers = [page_num], maxpages=1):
380
 
381
  page_analyzer_results = []
382
  page_analyzed_bounding_boxes = []
@@ -403,8 +463,8 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
403
  annotations_all_pages.extend([annotations_on_page])
404
  decision_process_table_all_pages.extend([decision_process_table_on_page])
405
 
406
- print("For page number:", page_num, "there are", len(annotations_all_pages[page_num]), "annotations")
407
 
408
- page_num += 1
409
 
410
  return pdf, decision_process_table_all_pages
 
18
  import gradio as gr
19
 
20
 
21
+ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, progress=gr.Progress(track_tqdm=True)):
22
 
23
  tic = time.perf_counter()
24
 
 
73
  # return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
74
 
75
  print("Redacting file as image-based file")
76
+ pdf_images, output_logs = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max)
77
  out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
78
  pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
79
 
 
97
 
98
  # Analyse text-based pdf
99
  print('Redacting file as text-based PDF')
100
+ pdf_text, output_logs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max)
101
  out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
102
  pdf_text.save(out_text_file_path)
103
 
 
175
  merged_bboxes.append(merged_box)
176
  return merged_bboxes
177
 
178
+ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, page_min:int=0, page_max:int=999, progress=Progress(track_tqdm=True)):
179
  '''
180
  Take an path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
181
  '''
182
 
183
  fill = (0, 0, 0)
184
+ decision_process_output_str = ""
185
 
186
  if not image_paths:
187
  out_message = "PDF does not exist as images. Converting pages to image"
 
191
  image_paths = process_file(file_path)
192
 
193
  images = []
194
+
195
+ #print("Image paths:", image_paths)
196
+ number_of_pages = len(image_paths[0])
197
+
198
+ print("Number of pages:", str(number_of_pages))
199
 
200
  out_message = "Redacting pages"
201
  print(out_message)
202
  #progress(0.1, desc=out_message)
203
 
204
+ # Check that page_min and page_max are within expected ranges
205
+ if page_max > number_of_pages or page_max == 0:
206
+ page_max = number_of_pages
207
+ #else:
208
+ # page_max = page_max - 1
 
 
209
 
210
+ if page_min <= 0:
211
+ page_min = 0
212
+ else:
213
+ page_min = page_min - 1
214
 
215
+ print("Page range:", str(page_min), "to", str(page_max))
 
 
216
 
217
+ #for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
218
+
219
+ for n in range(0, number_of_pages):
220
+
221
+ try:
222
+ image = image_paths[0][n]#.copy()
223
+ print("Skipping page", str(n))
224
+ #print("image:", image)
225
+ except Exception as e:
226
+ print("Could not redact page:", str(i), "due to:")
227
+ print(e)
228
+ continue
229
+
230
+ if n >= page_min and n <= page_max:
231
+ #for i in range(page_min, page_max):
232
+
233
+ i = n
234
+
235
+ print("Redacting page", str(i))
236
+
237
+ # Get the image to redact using PIL lib (pillow)
238
+ #print("image_paths:", image_paths)
239
+
240
+ #image = ImageChops.duplicate(image_paths[i])
241
+ #print("Image paths i:", image_paths[0])
242
+
243
+ # Assuming image_paths[i] is your PIL image object
244
+ try:
245
+ image = image_paths[0][i]#.copy()
246
+ #print("image:", image)
247
+ except Exception as e:
248
+ print("Could not redact page:", str(i), "due to:")
249
+ print(e)
250
+ continue
251
+
252
+ # %%
253
+ image_analyser = ImageAnalyzerEngine(nlp_analyser)
254
+ engine = ImageRedactorEngine(image_analyser)
255
+
256
+ if language == 'en':
257
+ ocr_lang = 'eng'
258
+ else: ocr_lang = language
259
+
260
+ bboxes = image_analyser.analyze(image,ocr_kwargs={"lang": ocr_lang},
261
+ **{
262
+ "allow_list": allow_list,
263
+ "language": language,
264
+ "entities": chosen_redact_entities,
265
+ "score_threshold": score_threshold,
266
+ "return_decision_process":True,
267
+ })
268
+
269
+ # Text placeholder in this processing step, as the analyze method does not return the OCR text
270
+ if bboxes:
271
+ decision_process_output_str = str(bboxes)
272
+ print("Decision process:", decision_process_output_str)
273
+
274
+ #print("For page: ", str(i), "Bounding boxes: ", bboxes)
275
 
276
+ draw = ImageDraw.Draw(image)
277
+
278
+ merged_bboxes = merge_img_bboxes(bboxes)
279
 
280
+ #print("For page:", str(i), "Merged bounding boxes:", merged_bboxes)
281
 
282
+ # 3. Draw the merged boxes (unchanged)
283
+ for box in merged_bboxes:
284
+ x0 = box.left
285
+ y0 = box.top
286
+ x1 = x0 + box.width
287
+ y1 = y0 + box.height
288
+ draw.rectangle([x0, y0, x1, y1], fill=fill)
289
 
290
  images.append(image)
291
 
 
401
  annotations_on_page.append(annotation)
402
  return annotations_on_page
403
 
404
+ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, page_min:int=0, page_max:int=999, progress=Progress(track_tqdm=True)):
405
  '''
406
  Redact chosen entities from a pdf that is made up of multiple pages that are not images.
407
  '''
 
413
  pdf = Pdf.open(filename)
414
  page_num = 0
415
 
416
+ number_of_pages = len(pdf.pages)
417
+
418
+ # Check that page_min and page_max are within expected ranges
419
+ if page_max > number_of_pages or page_max == 0:
420
+ page_max = number_of_pages
421
+ #else:
422
+ # page_max = page_max - 1
423
+
424
+ if page_min <= 0:
425
+ page_min = 0
426
+ else:
427
+ page_min = page_min - 1
428
+
429
+ print("Page range is",str(page_min), "to", str(page_max))
430
+
431
+ for page_no in range(page_min, page_max):
432
+ page = pdf.pages[page_no]
433
+
434
+ print("Page number is:", page_no)
435
 
436
  annotations_on_page = []
437
  decision_process_table_on_page = []
438
 
439
+ for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
440
 
441
  page_analyzer_results = []
442
  page_analyzed_bounding_boxes = []
 
463
  annotations_all_pages.extend([annotations_on_page])
464
  decision_process_table_all_pages.extend([decision_process_table_on_page])
465
 
466
+ print("For page number:", page_no, "there are", len(annotations_all_pages[page_num]), "annotations")
467
 
468
+ #page_num += 1
469
 
470
  return pdf, decision_process_table_all_pages