Commit
·
2807627
1
Parent(s):
19846ba
Fixed some more input bugs
Browse files
- app.py +5 -4
- tools/file_conversion.py +3 -3
- tools/file_redaction.py +20 -32
app.py
CHANGED
|
@@ -20,6 +20,7 @@ with block:
|
|
| 20 |
|
| 21 |
prepared_pdf_state = gr.State([])
|
| 22 |
output_image_files_state = gr.State([])
|
|
|
|
| 23 |
|
| 24 |
gr.Markdown(
|
| 25 |
"""
|
|
@@ -61,13 +62,13 @@ with block:
|
|
| 61 |
### Loading AWS data ###
|
| 62 |
load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
|
| 63 |
|
| 64 |
-
redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file,
|
| 65 |
outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
|
| 66 |
then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
|
| 67 |
-
outputs=[output_summary, output_file], api_name="redact")
|
| 68 |
|
| 69 |
-
convert_text_pdf_to_img_btn.click(fn = convert_text_pdf_to_img_pdf, inputs=[in_file,
|
| 70 |
-
outputs=[output_summary, output_file]
|
| 71 |
|
| 72 |
# Simple run for HF spaces or local on your computer
|
| 73 |
#block.queue().launch(debug=True) # root_path="/address-match", debug=True, server_name="0.0.0.0",
|
|
|
|
| 20 |
|
| 21 |
prepared_pdf_state = gr.State([])
|
| 22 |
output_image_files_state = gr.State([])
|
| 23 |
+
output_file_list_state = gr.State([])
|
| 24 |
|
| 25 |
gr.Markdown(
|
| 26 |
"""
|
|
|
|
| 62 |
### Loading AWS data ###
|
| 63 |
load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
|
| 64 |
|
| 65 |
+
redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list],
|
| 66 |
outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
|
| 67 |
then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
|
| 68 |
+
outputs=[output_summary, output_file, output_file_list_state], api_name="redact")
|
| 69 |
|
| 70 |
+
convert_text_pdf_to_img_btn.click(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file_list_state],
|
| 71 |
+
outputs=[output_summary, output_file])
|
| 72 |
|
| 73 |
# Simple run for HF spaces or local on your computer
|
| 74 |
#block.queue().launch(debug=True) # root_path="/address-match", debug=True, server_name="0.0.0.0",
|
tools/file_conversion.py
CHANGED
|
@@ -86,7 +86,7 @@ def process_file(file_path):
|
|
| 86 |
|
| 87 |
return out_path
|
| 88 |
|
| 89 |
-
def prepare_image_or_text_pdf(file_path:str,
|
| 90 |
|
| 91 |
out_message = ''
|
| 92 |
out_file_paths = []
|
|
@@ -119,11 +119,11 @@ def prepare_image_or_text_pdf(file_path:str, language:str, in_redact_method:str,
|
|
| 119 |
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
|
| 120 |
file_path_without_ext = get_file_path_end(in_file_path)
|
| 121 |
|
| 122 |
-
out_file_paths =
|
| 123 |
|
| 124 |
# Convert annotated text pdf back to image to give genuine redactions
|
| 125 |
print("Creating image version of results")
|
| 126 |
-
pdf_text_image_paths = process_file(out_text_file_path)
|
| 127 |
out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"
|
| 128 |
pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])
|
| 129 |
|
|
|
|
| 86 |
|
| 87 |
return out_path
|
| 88 |
|
| 89 |
+
def prepare_image_or_text_pdf(file_path:str, in_redact_method:str, in_allow_list:List[List[str]]=None):
|
| 90 |
|
| 91 |
out_message = ''
|
| 92 |
out_file_paths = []
|
|
|
|
| 119 |
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
|
| 120 |
file_path_without_ext = get_file_path_end(in_file_path)
|
| 121 |
|
| 122 |
+
out_file_paths = out_text_file_path
|
| 123 |
|
| 124 |
# Convert annotated text pdf back to image to give genuine redactions
|
| 125 |
print("Creating image version of results")
|
| 126 |
+
pdf_text_image_paths = process_file(out_text_file_path[0])
|
| 127 |
out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"
|
| 128 |
pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])
|
| 129 |
|
tools/file_redaction.py
CHANGED
|
@@ -21,7 +21,8 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
|
|
| 21 |
out_message = ''
|
| 22 |
out_file_paths = []
|
| 23 |
|
| 24 |
-
|
|
|
|
| 25 |
|
| 26 |
if file_path:
|
| 27 |
file_path_without_ext = get_file_path_end(file_path)
|
|
@@ -35,7 +36,7 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
|
|
| 35 |
# if is_pdf_or_image(file_path) == False:
|
| 36 |
# return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
|
| 37 |
|
| 38 |
-
pdf_images = redact_image_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
|
| 39 |
out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
|
| 40 |
pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
|
| 41 |
|
|
@@ -53,9 +54,8 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
|
|
| 53 |
|
| 54 |
out_file_paths.append(out_text_file_path)
|
| 55 |
|
| 56 |
-
|
| 57 |
|
| 58 |
-
|
| 59 |
else:
|
| 60 |
out_message = "No redaction method selected"
|
| 61 |
print(out_message)
|
|
@@ -67,19 +67,21 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
|
|
| 67 |
|
| 68 |
out_message = out_message + "\n\n" + out_time
|
| 69 |
|
| 70 |
-
return out_message, out_file_paths
|
| 71 |
|
| 72 |
|
| 73 |
-
def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
|
| 74 |
'''
|
| 75 |
take a path for an image of a document, then run this image through the Presidio ImageAnalyzer to get a redacted page back
|
| 76 |
'''
|
| 77 |
|
| 78 |
-
|
| 79 |
-
print(out_message)
|
| 80 |
-
progress(0, desc=out_message)
|
| 81 |
|
| 82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
# Create a new PDF
|
| 85 |
#pdf = pikepdf.new()
|
|
@@ -136,7 +138,10 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
|
|
| 136 |
|
| 137 |
pdf = Pdf.open(filename)
|
| 138 |
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
print("Page number is: ", page_num)
|
| 142 |
|
|
@@ -169,25 +174,6 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
|
|
| 169 |
if isinstance(line, LTTextLine) # Check if the line is an instance of LTTextLine
|
| 170 |
for char in line] # Loop through each character in the line
|
| 171 |
#if isinstance(char, LTChar)] # Check if the character is not an instance of LTAnno #isinstance(char, LTChar) or
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
#print(characters)
|
| 175 |
-
|
| 176 |
-
# Collect unique types
|
| 177 |
-
# unique_types = set()
|
| 178 |
-
|
| 179 |
-
# for line in text_container:
|
| 180 |
-
# if isinstance(line, LTTextLine):
|
| 181 |
-
# print("Line: ", line)
|
| 182 |
-
# for char in line:
|
| 183 |
-
# unique_types.add(type(char))
|
| 184 |
-
# if isinstance(char, LTAnno):
|
| 185 |
-
# print(char)
|
| 186 |
-
|
| 187 |
-
# # Print the unique types
|
| 188 |
-
# print("Unique types in text_container:")
|
| 189 |
-
# for t in unique_types:
|
| 190 |
-
# print(t)
|
| 191 |
|
| 192 |
# If any results found
|
| 193 |
print(analyzer_results)
|
|
@@ -216,13 +202,15 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
|
|
| 216 |
CA=1, # Transparency
|
| 217 |
T=analyzed_bounding_box["result"].entity_type
|
| 218 |
)
|
| 219 |
-
annotations_on_page.append(annotation)
|
| 220 |
|
| 221 |
annotations_all_pages.extend([annotations_on_page])
|
| 222 |
-
|
| 223 |
print("For page number: ", page_num, " there are ", len(annotations_all_pages[page_num]), " annotations")
|
| 224 |
page.Annots = pdf.make_indirect(annotations_on_page)
|
| 225 |
|
|
|
|
|
|
|
| 226 |
# Extracting data from dictionaries
|
| 227 |
# extracted_data = []
|
| 228 |
# for item in annotations_all_pages:
|
|
|
|
| 21 |
out_message = ''
|
| 22 |
out_file_paths = []
|
| 23 |
|
| 24 |
+
if in_allow_list:
|
| 25 |
+
in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
|
| 26 |
|
| 27 |
if file_path:
|
| 28 |
file_path_without_ext = get_file_path_end(file_path)
|
|
|
|
| 36 |
# if is_pdf_or_image(file_path) == False:
|
| 37 |
# return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
|
| 38 |
|
| 39 |
+
pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)
|
| 40 |
out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
|
| 41 |
pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
|
| 42 |
|
|
|
|
| 54 |
|
| 55 |
out_file_paths.append(out_text_file_path)
|
| 56 |
|
| 57 |
+
out_message = "Text-based PDF successfully redacted and saved to file."
|
| 58 |
|
|
|
|
| 59 |
else:
|
| 60 |
out_message = "No redaction method selected"
|
| 61 |
print(out_message)
|
|
|
|
| 67 |
|
| 68 |
out_message = out_message + "\n\n" + out_time
|
| 69 |
|
| 70 |
+
return out_message, out_file_paths, out_file_paths
|
| 71 |
|
| 72 |
|
| 73 |
+
def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
|
| 74 |
'''
|
| 75 |
take a path for an image of a document, then run this image through the Presidio ImageAnalyzer to get a redacted page back
|
| 76 |
'''
|
| 77 |
|
| 78 |
+
if not image_paths:
|
|
|
|
|
|
|
| 79 |
|
| 80 |
+
out_message = "PDF does not exist as images. Converting pages to image"
|
| 81 |
+
print(out_message)
|
| 82 |
+
progress(0, desc=out_message)
|
| 83 |
+
|
| 84 |
+
image_paths = process_file(file_path)
|
| 85 |
|
| 86 |
# Create a new PDF
|
| 87 |
#pdf = pikepdf.new()
|
|
|
|
| 138 |
|
| 139 |
pdf = Pdf.open(filename)
|
| 140 |
|
| 141 |
+
page_num = 0
|
| 142 |
+
|
| 143 |
+
for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
|
| 144 |
+
|
| 145 |
|
| 146 |
print("Page number is: ", page_num)
|
| 147 |
|
|
|
|
| 174 |
if isinstance(line, LTTextLine) # Check if the line is an instance of LTTextLine
|
| 175 |
for char in line] # Loop through each character in the line
|
| 176 |
#if isinstance(char, LTChar)] # Check if the character is not an instance of LTAnno #isinstance(char, LTChar) or
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
|
| 178 |
# If any results found
|
| 179 |
print(analyzer_results)
|
|
|
|
| 202 |
CA=1, # Transparency
|
| 203 |
T=analyzed_bounding_box["result"].entity_type
|
| 204 |
)
|
| 205 |
+
annotations_on_page.append(annotation)
|
| 206 |
|
| 207 |
annotations_all_pages.extend([annotations_on_page])
|
| 208 |
+
|
| 209 |
print("For page number: ", page_num, " there are ", len(annotations_all_pages[page_num]), " annotations")
|
| 210 |
page.Annots = pdf.make_indirect(annotations_on_page)
|
| 211 |
|
| 212 |
+
page_num += 1
|
| 213 |
+
|
| 214 |
# Extracting data from dictionaries
|
| 215 |
# extracted_data = []
|
| 216 |
# for item in annotations_all_pages:
|