Spaces:
Sleeping
Sleeping
Commit
·
bc4bdbd
1
Parent(s):
bbf818d
Specific pages in a document can now be selected for redaction. Image-based redaction should now work correctly.
Browse files
- app.py +30 -8
- requirements.txt +3 -3
- tools/file_conversion.py +9 -4
- tools/file_redaction.py +112 -52
app.py
CHANGED
@@ -27,6 +27,19 @@ language = 'en'
|
|
27 |
feedback_data_folder = 'feedback/' + today_rev + '/'
|
28 |
logs_data_folder = 'logs/' + today_rev + '/'
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
# Create the gradio interface
|
31 |
app = gr.Blocks(theme = gr.themes.Base())
|
32 |
|
@@ -42,16 +55,20 @@ with app:
|
|
42 |
|
43 |
session_hash_state = gr.State()
|
44 |
s3_output_folder_state = gr.State()
|
|
|
|
|
|
|
45 |
feedback_logs_state = gr.State(feedback_data_folder + 'log.csv')
|
46 |
feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
|
47 |
usage_logs_state = gr.State(logs_data_folder + 'log.csv')
|
48 |
usage_s3_logs_loc_state = gr.State(logs_data_folder)
|
|
|
49 |
|
50 |
gr.Markdown(
|
51 |
"""
|
52 |
# Document redaction
|
53 |
|
54 |
-
Redact personal information from documents, open text, or xlsx/csv tabular data. See the 'Redaction settings' to change various settings such as which types of information to redact (e.g. people, places), or terms to exclude from redaction.
|
55 |
|
56 |
WARNING: In testing the app seems to only find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
|
57 |
|
@@ -115,6 +132,9 @@ with app:
|
|
115 |
""")
|
116 |
with gr.Accordion("Settings for documents", open = True):
|
117 |
in_redaction_method = gr.Radio(label="Default document redaction method - text analysis is faster is not useful for image-based PDFs. Imaged-based is slightly less accurate in general.", value = "Text analysis", choices=["Text analysis", "Image analysis"])
|
|
|
|
|
|
|
118 |
with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
|
119 |
anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
|
120 |
|
@@ -143,12 +163,12 @@ with app:
|
|
143 |
|
144 |
# Document redaction
|
145 |
redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
|
146 |
-
then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state],
|
147 |
outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state], api_name="redact_doc")
|
148 |
|
149 |
# If the output file count text box changes, keep going with redacting each document until done
|
150 |
text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
|
151 |
-
then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state],
|
152 |
outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state]).\
|
153 |
then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
|
154 |
|
@@ -162,9 +182,11 @@ with app:
|
|
162 |
then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
|
163 |
|
164 |
#app.load(wipe_logs, inputs=[feedback_logs_state, usage_logs_state], outputs=[]).\
|
165 |
-
# then(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
|
166 |
-
|
167 |
-
|
|
|
|
|
168 |
|
169 |
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
170 |
callback = gr.CSVLogger()
|
@@ -190,6 +212,6 @@ print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
|
|
190 |
|
191 |
if __name__ == "__main__":
|
192 |
if os.environ['COGNITO_AUTH'] == "1":
|
193 |
-
app.queue().launch(show_error=True, auth=authenticate_user, max_file_size='
|
194 |
else:
|
195 |
-
app.queue().launch(show_error=True, inbrowser=True, max_file_size='
|
|
|
27 |
feedback_data_folder = 'feedback/' + today_rev + '/'
|
28 |
logs_data_folder = 'logs/' + today_rev + '/'
|
29 |
|
30 |
+
def create_logs_folder(session_hash_textbox):
    """Create session-specific state holders for the feedback and usage logs.

    The feedback folder is 'feedback/<session_hash_textbox>/<today_rev>/' and
    the usage folder is 'logs/<session_hash_textbox>/<today_rev>/', where
    ``today_rev`` is read from module scope.

    Returns four ``gr.State`` objects, in this order: feedback log file path,
    feedback folder, usage log file path, usage folder.
    """
    print("session_hash_textbox", session_hash_textbox)

    # Both folders share the same '<session>/<date>/' tail.
    session_tail = session_hash_textbox + "/" + today_rev + '/'
    session_feedback_dir = 'feedback/' + session_tail
    session_usage_dir = 'logs/' + session_tail

    # States are built in the same order as they are returned.
    return (
        gr.State(session_feedback_dir + 'log.csv'),
        gr.State(session_feedback_dir),
        gr.State(session_usage_dir + 'log.csv'),
        gr.State(session_usage_dir),
    )
|
42 |
+
|
43 |
# Create the gradio interface
|
44 |
app = gr.Blocks(theme = gr.themes.Base())
|
45 |
|
|
|
55 |
|
56 |
session_hash_state = gr.State()
|
57 |
s3_output_folder_state = gr.State()
|
58 |
+
|
59 |
+
|
60 |
+
|
61 |
feedback_logs_state = gr.State(feedback_data_folder + 'log.csv')
|
62 |
feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
|
63 |
usage_logs_state = gr.State(logs_data_folder + 'log.csv')
|
64 |
usage_s3_logs_loc_state = gr.State(logs_data_folder)
|
65 |
+
|
66 |
|
67 |
gr.Markdown(
|
68 |
"""
|
69 |
# Document redaction
|
70 |
|
71 |
+
Redact personal information from documents, open text, or xlsx/csv tabular data. See the 'Redaction settings' to change various settings such as which types of information to redact (e.g. people, places), or terms to exclude from redaction. If you are getting 0 redactions, it's possible that the text in the document is saved in image format instead of as selectable text. Select 'Image analysis' on the Settings page in this case.
|
72 |
|
73 |
WARNING: In testing the app seems to only find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
|
74 |
|
|
|
132 |
""")
|
133 |
with gr.Accordion("Settings for documents", open = True):
|
134 |
in_redaction_method = gr.Radio(label="Default document redaction method - text analysis is faster is not useful for image-based PDFs. Imaged-based is slightly less accurate in general.", value = "Text analysis", choices=["Text analysis", "Image analysis"])
|
135 |
+
with gr.Row():
|
136 |
+
page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
|
137 |
+
page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
|
138 |
with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
|
139 |
anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
|
140 |
|
|
|
163 |
|
164 |
# Document redaction
|
165 |
redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
|
166 |
+
then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max],
|
167 |
outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state], api_name="redact_doc")
|
168 |
|
169 |
# If the output file count text box changes, keep going with redacting each document until done
|
170 |
text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
|
171 |
+
then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max],
|
172 |
outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state]).\
|
173 |
then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
|
174 |
|
|
|
182 |
then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
|
183 |
|
184 |
#app.load(wipe_logs, inputs=[feedback_logs_state, usage_logs_state], outputs=[]).\
|
185 |
+
# then(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
|
186 |
+
|
187 |
+
|
188 |
+
app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])#.\
|
189 |
+
#then(create_logs_folder, inputs=[session_hash_textbox], outputs = [feedback_logs_state, feedback_s3_logs_loc_state, usage_logs_state, usage_s3_logs_loc_state])
|
190 |
|
191 |
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
192 |
callback = gr.CSVLogger()
|
|
|
212 |
|
213 |
if __name__ == "__main__":
|
214 |
if os.environ['COGNITO_AUTH'] == "1":
|
215 |
+
app.queue().launch(show_error=True, auth=authenticate_user, max_file_size='50mb')
|
216 |
else:
|
217 |
+
app.queue().launch(show_error=True, inbrowser=True, max_file_size='50mb')
|
requirements.txt
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
pdfminer.six==20231228
|
2 |
pdf2image==1.17.0
|
3 |
opencv-python==4.9.0.80
|
4 |
-
presidio_analyzer==2.2.
|
5 |
-
presidio_anonymizer==2.2.
|
6 |
-
presidio-image-redactor==0.0.
|
7 |
pikepdf==8.15.1
|
8 |
pandas==2.2.2
|
9 |
spacy==3.7.5
|
|
|
1 |
pdfminer.six==20231228
|
2 |
pdf2image==1.17.0
|
3 |
opencv-python==4.9.0.80
|
4 |
+
presidio_analyzer==2.2.355
|
5 |
+
presidio_anonymizer==2.2.355
|
6 |
+
presidio-image-redactor==0.0.53
|
7 |
pikepdf==8.15.1
|
8 |
pandas==2.2.2
|
9 |
spacy==3.7.5
|
tools/file_conversion.py
CHANGED
@@ -36,7 +36,7 @@ def is_pdf(filename):
|
|
36 |
# %%
|
37 |
## Convert pdf to image if necessary
|
38 |
|
39 |
-
def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
|
40 |
|
41 |
# Get the number of pages in the PDF
|
42 |
page_count = pdfinfo_from_path(pdf_path)['Pages']
|
@@ -46,21 +46,26 @@ def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
|
|
46 |
|
47 |
# Open the PDF file
|
48 |
#for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
|
49 |
-
for page_num in range(
|
50 |
|
51 |
# print("Current page: ", str(page_num + 1))
|
52 |
|
53 |
# Convert one page to image
|
54 |
image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300, use_cropbox=True, use_pdftocairo=False)
|
55 |
|
|
|
56 |
# If no images are returned, break the loop
|
57 |
if not image:
|
58 |
print("Conversion of page", str(page_num), "to file failed.")
|
59 |
break
|
60 |
|
|
|
|
|
|
|
61 |
images.extend(image)
|
62 |
|
63 |
print("PDF has been converted to images.")
|
|
|
64 |
|
65 |
return images
|
66 |
|
@@ -146,7 +151,7 @@ def prepare_image_or_text_pdf(
|
|
146 |
#in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
|
147 |
|
148 |
file_paths_loop = [file_paths[int(latest_file_completed)]]
|
149 |
-
print("file_paths_loop:", str(file_paths_loop))
|
150 |
|
151 |
#for file in progress.tqdm(file_paths, desc="Preparing files"):
|
152 |
for file in file_paths_loop:
|
@@ -169,7 +174,7 @@ def prepare_image_or_text_pdf(
|
|
169 |
return out_message, out_file_paths
|
170 |
|
171 |
out_file_path = process_file(file_path)
|
172 |
-
print("Out file path at image conversion step:", out_file_path)
|
173 |
|
174 |
elif in_redact_method == "Text analysis":
|
175 |
if is_pdf(file_path) == False:
|
|
|
36 |
# %%
|
37 |
## Convert pdf to image if necessary
|
38 |
|
39 |
+
def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(track_tqdm=True)):
|
40 |
|
41 |
# Get the number of pages in the PDF
|
42 |
page_count = pdfinfo_from_path(pdf_path)['Pages']
|
|
|
46 |
|
47 |
# Open the PDF file
|
48 |
#for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
|
49 |
+
for page_num in range(page_min,page_count): #progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
|
50 |
|
51 |
# print("Current page: ", str(page_num + 1))
|
52 |
|
53 |
# Convert one page to image
|
54 |
image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300, use_cropbox=True, use_pdftocairo=False)
|
55 |
|
56 |
+
|
57 |
# If no images are returned, break the loop
|
58 |
if not image:
|
59 |
print("Conversion of page", str(page_num), "to file failed.")
|
60 |
break
|
61 |
|
62 |
+
# print("Conversion of page", str(page_num), "to file succeeded.")
|
63 |
+
# print("image:", image)
|
64 |
+
|
65 |
images.extend(image)
|
66 |
|
67 |
print("PDF has been converted to images.")
|
68 |
+
# print("Images:", images)
|
69 |
|
70 |
return images
|
71 |
|
|
|
151 |
#in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
|
152 |
|
153 |
file_paths_loop = [file_paths[int(latest_file_completed)]]
|
154 |
+
#print("file_paths_loop:", str(file_paths_loop))
|
155 |
|
156 |
#for file in progress.tqdm(file_paths, desc="Preparing files"):
|
157 |
for file in file_paths_loop:
|
|
|
174 |
return out_message, out_file_paths
|
175 |
|
176 |
out_file_path = process_file(file_path)
|
177 |
+
#print("Out file path at image conversion step:", out_file_path)
|
178 |
|
179 |
elif in_redact_method == "Text analysis":
|
180 |
if is_pdf(file_path) == False:
|
tools/file_redaction.py
CHANGED
@@ -18,7 +18,7 @@ from tools.data_anonymise import generate_decision_process_output
|
|
18 |
import gradio as gr
|
19 |
|
20 |
|
21 |
-
def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, progress=gr.Progress(track_tqdm=True)):
|
22 |
|
23 |
tic = time.perf_counter()
|
24 |
|
@@ -73,7 +73,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
73 |
# return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
|
74 |
|
75 |
print("Redacting file as image-based file")
|
76 |
-
pdf_images, output_logs = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf)
|
77 |
out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
|
78 |
pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
|
79 |
|
@@ -97,7 +97,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
97 |
|
98 |
# Analyse text-based pdf
|
99 |
print('Redacting file as text-based PDF')
|
100 |
-
pdf_text, output_logs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
|
101 |
out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
|
102 |
pdf_text.save(out_text_file_path)
|
103 |
|
@@ -175,12 +175,13 @@ def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
|
|
175 |
merged_bboxes.append(merged_box)
|
176 |
return merged_bboxes
|
177 |
|
178 |
-
def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, progress=Progress(track_tqdm=True)):
|
179 |
'''
|
180 |
Take an path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
|
181 |
'''
|
182 |
|
183 |
fill = (0, 0, 0)
|
|
|
184 |
|
185 |
if not image_paths:
|
186 |
out_message = "PDF does not exist as images. Converting pages to image"
|
@@ -190,59 +191,101 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
|
|
190 |
image_paths = process_file(file_path)
|
191 |
|
192 |
images = []
|
193 |
-
|
|
|
|
|
|
|
|
|
194 |
|
195 |
out_message = "Redacting pages"
|
196 |
print(out_message)
|
197 |
#progress(0.1, desc=out_message)
|
198 |
|
199 |
-
#
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
# Get the image to redact using PIL lib (pillow)
|
205 |
-
#print("image_paths:", image_paths)
|
206 |
|
207 |
-
|
|
|
|
|
|
|
208 |
|
209 |
-
|
210 |
-
image_analyser = ImageAnalyzerEngine(nlp_analyser)
|
211 |
-
engine = ImageRedactorEngine(image_analyser)
|
212 |
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
232 |
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
|
237 |
-
|
238 |
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
|
247 |
images.append(image)
|
248 |
|
@@ -358,7 +401,7 @@ def create_annotations_for_bounding_boxes(analyzed_bounding_boxes):
|
|
358 |
annotations_on_page.append(annotation)
|
359 |
return annotations_on_page
|
360 |
|
361 |
-
def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
|
362 |
'''
|
363 |
Redact chosen entities from a pdf that is made up of multiple pages that are not images.
|
364 |
'''
|
@@ -370,13 +413,30 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
|
|
370 |
pdf = Pdf.open(filename)
|
371 |
page_num = 0
|
372 |
|
373 |
-
|
374 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
375 |
|
376 |
annotations_on_page = []
|
377 |
decision_process_table_on_page = []
|
378 |
|
379 |
-
for page_layout in extract_pages(filename, page_numbers = [
|
380 |
|
381 |
page_analyzer_results = []
|
382 |
page_analyzed_bounding_boxes = []
|
@@ -403,8 +463,8 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
|
|
403 |
annotations_all_pages.extend([annotations_on_page])
|
404 |
decision_process_table_all_pages.extend([decision_process_table_on_page])
|
405 |
|
406 |
-
print("For page number:",
|
407 |
|
408 |
-
page_num += 1
|
409 |
|
410 |
return pdf, decision_process_table_all_pages
|
|
|
18 |
import gradio as gr
|
19 |
|
20 |
|
21 |
+
def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, progress=gr.Progress(track_tqdm=True)):
|
22 |
|
23 |
tic = time.perf_counter()
|
24 |
|
|
|
73 |
# return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
|
74 |
|
75 |
print("Redacting file as image-based file")
|
76 |
+
pdf_images, output_logs = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max)
|
77 |
out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
|
78 |
pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
|
79 |
|
|
|
97 |
|
98 |
# Analyse text-based pdf
|
99 |
print('Redacting file as text-based PDF')
|
100 |
+
pdf_text, output_logs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max)
|
101 |
out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
|
102 |
pdf_text.save(out_text_file_path)
|
103 |
|
|
|
175 |
merged_bboxes.append(merged_box)
|
176 |
return merged_bboxes
|
177 |
|
178 |
+
def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, page_min:int=0, page_max:int=999, progress=Progress(track_tqdm=True)):
|
179 |
'''
|
180 |
Take an path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
|
181 |
'''
|
182 |
|
183 |
fill = (0, 0, 0)
|
184 |
+
decision_process_output_str = ""
|
185 |
|
186 |
if not image_paths:
|
187 |
out_message = "PDF does not exist as images. Converting pages to image"
|
|
|
191 |
image_paths = process_file(file_path)
|
192 |
|
193 |
images = []
|
194 |
+
|
195 |
+
#print("Image paths:", image_paths)
|
196 |
+
number_of_pages = len(image_paths[0])
|
197 |
+
|
198 |
+
print("Number of pages:", str(number_of_pages))
|
199 |
|
200 |
out_message = "Redacting pages"
|
201 |
print(out_message)
|
202 |
#progress(0.1, desc=out_message)
|
203 |
|
204 |
+
# Check that page_min and page_max are within expected ranges
|
205 |
+
if page_max > number_of_pages or page_max == 0:
|
206 |
+
page_max = number_of_pages
|
207 |
+
#else:
|
208 |
+
# page_max = page_max - 1
|
|
|
|
|
209 |
|
210 |
+
if page_min <= 0:
|
211 |
+
page_min = 0
|
212 |
+
else:
|
213 |
+
page_min = page_min - 1
|
214 |
|
215 |
+
print("Page range:", str(page_min), "to", str(page_max))
|
|
|
|
|
216 |
|
217 |
+
#for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
|
218 |
+
|
219 |
+
for n in range(0, number_of_pages):
|
220 |
+
|
221 |
+
try:
|
222 |
+
image = image_paths[0][n]#.copy()
|
223 |
+
print("Skipping page", str(n))
|
224 |
+
#print("image:", image)
|
225 |
+
except Exception as e:
|
226 |
+
print("Could not redact page:", str(i), "due to:")
|
227 |
+
print(e)
|
228 |
+
continue
|
229 |
+
|
230 |
+
if n >= page_min and n <= page_max:
|
231 |
+
#for i in range(page_min, page_max):
|
232 |
+
|
233 |
+
i = n
|
234 |
+
|
235 |
+
print("Redacting page", str(i))
|
236 |
+
|
237 |
+
# Get the image to redact using PIL lib (pillow)
|
238 |
+
#print("image_paths:", image_paths)
|
239 |
+
|
240 |
+
#image = ImageChops.duplicate(image_paths[i])
|
241 |
+
#print("Image paths i:", image_paths[0])
|
242 |
+
|
243 |
+
# Assuming image_paths[i] is your PIL image object
|
244 |
+
try:
|
245 |
+
image = image_paths[0][i]#.copy()
|
246 |
+
#print("image:", image)
|
247 |
+
except Exception as e:
|
248 |
+
print("Could not redact page:", str(i), "due to:")
|
249 |
+
print(e)
|
250 |
+
continue
|
251 |
+
|
252 |
+
# %%
|
253 |
+
image_analyser = ImageAnalyzerEngine(nlp_analyser)
|
254 |
+
engine = ImageRedactorEngine(image_analyser)
|
255 |
+
|
256 |
+
if language == 'en':
|
257 |
+
ocr_lang = 'eng'
|
258 |
+
else: ocr_lang = language
|
259 |
+
|
260 |
+
bboxes = image_analyser.analyze(image,ocr_kwargs={"lang": ocr_lang},
|
261 |
+
**{
|
262 |
+
"allow_list": allow_list,
|
263 |
+
"language": language,
|
264 |
+
"entities": chosen_redact_entities,
|
265 |
+
"score_threshold": score_threshold,
|
266 |
+
"return_decision_process":True,
|
267 |
+
})
|
268 |
+
|
269 |
+
# Text placeholder in this processing step, as the analyze method does not return the OCR text
|
270 |
+
if bboxes:
|
271 |
+
decision_process_output_str = str(bboxes)
|
272 |
+
print("Decision process:", decision_process_output_str)
|
273 |
+
|
274 |
+
#print("For page: ", str(i), "Bounding boxes: ", bboxes)
|
275 |
|
276 |
+
draw = ImageDraw.Draw(image)
|
277 |
+
|
278 |
+
merged_bboxes = merge_img_bboxes(bboxes)
|
279 |
|
280 |
+
#print("For page:", str(i), "Merged bounding boxes:", merged_bboxes)
|
281 |
|
282 |
+
# 3. Draw the merged boxes (unchanged)
|
283 |
+
for box in merged_bboxes:
|
284 |
+
x0 = box.left
|
285 |
+
y0 = box.top
|
286 |
+
x1 = x0 + box.width
|
287 |
+
y1 = y0 + box.height
|
288 |
+
draw.rectangle([x0, y0, x1, y1], fill=fill)
|
289 |
|
290 |
images.append(image)
|
291 |
|
|
|
401 |
annotations_on_page.append(annotation)
|
402 |
return annotations_on_page
|
403 |
|
404 |
+
def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, page_min:int=0, page_max:int=999, progress=Progress(track_tqdm=True)):
|
405 |
'''
|
406 |
Redact chosen entities from a pdf that is made up of multiple pages that are not images.
|
407 |
'''
|
|
|
413 |
pdf = Pdf.open(filename)
|
414 |
page_num = 0
|
415 |
|
416 |
+
number_of_pages = len(pdf.pages)
|
417 |
+
|
418 |
+
# Check that page_min and page_max are within expected ranges
|
419 |
+
if page_max > number_of_pages or page_max == 0:
|
420 |
+
page_max = number_of_pages
|
421 |
+
#else:
|
422 |
+
# page_max = page_max - 1
|
423 |
+
|
424 |
+
if page_min <= 0:
|
425 |
+
page_min = 0
|
426 |
+
else:
|
427 |
+
page_min = page_min - 1
|
428 |
+
|
429 |
+
print("Page range is",str(page_min), "to", str(page_max))
|
430 |
+
|
431 |
+
for page_no in range(page_min, page_max):
|
432 |
+
page = pdf.pages[page_no]
|
433 |
+
|
434 |
+
print("Page number is:", page_no)
|
435 |
|
436 |
annotations_on_page = []
|
437 |
decision_process_table_on_page = []
|
438 |
|
439 |
+
for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
|
440 |
|
441 |
page_analyzer_results = []
|
442 |
page_analyzed_bounding_boxes = []
|
|
|
463 |
annotations_all_pages.extend([annotations_on_page])
|
464 |
decision_process_table_all_pages.extend([decision_process_table_on_page])
|
465 |
|
466 |
+
print("For page number:", page_no, "there are", len(annotations_all_pages[page_num]), "annotations")
|
467 |
|
468 |
+
#page_num += 1
|
469 |
|
470 |
return pdf, decision_process_table_all_pages
|