Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Community

seanpedrickcase committed on May 15, 2024

Commit

2807627

1 Parent(s): 19846ba

Fixed some more input bugs

Browse files

Files changed (3) hide show

app.py +5 -4
tools/file_conversion.py +3 -3
tools/file_redaction.py +20 -32

app.py CHANGED Viewed

@@ -20,6 +20,7 @@ with block:
     prepared_pdf_state = gr.State([])
     output_image_files_state = gr.State([])
     gr.Markdown(
     """
@@ -61,13 +62,13 @@ with block:
     ### Loading AWS data ###
     load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
-    redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
                     outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
     then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
-                    outputs=[output_summary, output_file], api_name="redact")
-    convert_text_pdf_to_img_btn.click(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file],
-                    outputs=[output_summary, output_file], api_name="redact")
 # Simple run for HF spaces or local on your computer
 #block.queue().launch(debug=True) # root_path="/address-match", debug=True, server_name="0.0.0.0",

     prepared_pdf_state = gr.State([])
     output_image_files_state = gr.State([])
+    output_file_list_state = gr.State([])
     gr.Markdown(
     """
     ### Loading AWS data ###
     load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
+    redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list],
                     outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
     then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
+                    outputs=[output_summary, output_file, output_file_list_state], api_name="redact")
+    convert_text_pdf_to_img_btn.click(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file_list_state],
+                    outputs=[output_summary, output_file])
 # Simple run for HF spaces or local on your computer
 #block.queue().launch(debug=True) # root_path="/address-match", debug=True, server_name="0.0.0.0",

tools/file_conversion.py CHANGED Viewed

@@ -86,7 +86,7 @@ def process_file(file_path):
     return out_path
-def prepare_image_or_text_pdf(file_path:str, language:str, in_redact_method:str, in_allow_list:List[List[str]]=None, progress=Progress(track_tqdm=True)):
     out_message = ''
     out_file_paths = []
@@ -119,11 +119,11 @@ def prepare_image_or_text_pdf(file_path:str, language:str, in_redact_method:str,
 def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
     file_path_without_ext = get_file_path_end(in_file_path)
-    out_file_paths = []
     # Convert annotated text pdf back to image to give genuine redactions
     print("Creating image version of results")
-    pdf_text_image_paths = process_file(out_text_file_path)
     out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"
     pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])

     return out_path
+def prepare_image_or_text_pdf(file_path:str, in_redact_method:str, in_allow_list:List[List[str]]=None):
     out_message = ''
     out_file_paths = []
 def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
     file_path_without_ext = get_file_path_end(in_file_path)
+    out_file_paths = out_text_file_path
     # Convert annotated text pdf back to image to give genuine redactions
     print("Creating image version of results")
+    pdf_text_image_paths = process_file(out_text_file_path[0])
     out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"
     pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])

tools/file_redaction.py CHANGED Viewed

@@ -21,7 +21,8 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
     out_message = ''
     out_file_paths = []
-    in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
     if file_path:
          file_path_without_ext = get_file_path_end(file_path)
@@ -35,7 +36,7 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
         # if is_pdf_or_image(file_path) == False:
         #     return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
-        pdf_images = redact_image_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
         out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
         pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
@@ -53,9 +54,8 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
         out_file_paths.append(out_text_file_path)
     else:
         out_message = "No redaction method selected"
         print(out_message)
@@ -67,19 +67,21 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
     out_message = out_message + "\n\n" + out_time
-    return out_message, out_file_paths
-def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
     '''
     take an path for an image of a document, then run this image through the Presidio ImageAnalyzer to get a redacted page back
     '''
-    out_message = "Converting pages to image"
-    print(out_message)
-    progress(0, desc=out_message)
-    image_paths = process_file(file_path)
     # Create a new PDF
     #pdf = pikepdf.new()
@@ -136,7 +138,10 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
     pdf = Pdf.open(filename)
-    for page_num, page in progress.tqdm(enumerate(pdf.pages), total=len(pdf.pages), unit="pages", desc="Redacting pages"):
         print("Page number is: ", page_num)
@@ -169,25 +174,6 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
                             if isinstance(line, LTTextLine)    # Check if the line is an instance of LTTextLine
                             for char in line]                   # Loop through each character in the line
                             #if isinstance(char, LTChar)]  # Check if the character is not an instance of LTAnno #isinstance(char, LTChar) or
-                    #print(characters)
-                    # Collect unique types
-                    # unique_types = set()
-                    # for line in text_container:
-                    #     if isinstance(line, LTTextLine):
-                    #         print("Line: ", line)
-                    #         for char in line:
-                    #             unique_types.add(type(char))
-                    #             if isinstance(char, LTAnno):
-                    #                 print(char)
-                    # # Print the unique types
-                    # print("Unique types in text_container:")
-                    # for t in unique_types:
-                    #     print(t)
                     # If any results found
                     print(analyzer_results)
@@ -216,13 +202,15 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
                     CA=1, # Transparency
                     T=analyzed_bounding_box["result"].entity_type
                 )
-                annotations_on_page.append(annotation)
             annotations_all_pages.extend([annotations_on_page])
             print("For page number: ", page_num, " there are ", len(annotations_all_pages[page_num]), " annotations")
             page.Annots = pdf.make_indirect(annotations_on_page)
         # Extracting data from dictionaries
         # extracted_data = []
         # for item in annotations_all_pages:

     out_message = ''
     out_file_paths = []
+    if in_allow_list:
+        in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
     if file_path:
          file_path_without_ext = get_file_path_end(file_path)
         # if is_pdf_or_image(file_path) == False:
         #     return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
+        pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)
         out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
         pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
         out_file_paths.append(out_text_file_path)
+        out_message = "Text-based PDF successfully redacted and saved to file."
     else:
         out_message = "No redaction method selected"
         print(out_message)
     out_message = out_message + "\n\n" + out_time
+    return out_message, out_file_paths, out_file_paths
+def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
     '''
     take an path for an image of a document, then run this image through the Presidio ImageAnalyzer to get a redacted page back
     '''
+    if not image_paths:
+        out_message = "PDF does not exist as images. Converting pages to image"
+        print(out_message)
+        progress(0, desc=out_message)
+        image_paths = process_file(file_path)
     # Create a new PDF
     #pdf = pikepdf.new()
     pdf = Pdf.open(filename)
+    page_num = 0
+    for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
         print("Page number is: ", page_num)
                             if isinstance(line, LTTextLine)    # Check if the line is an instance of LTTextLine
                             for char in line]                   # Loop through each character in the line
                             #if isinstance(char, LTChar)]  # Check if the character is not an instance of LTAnno #isinstance(char, LTChar) or
                     # If any results found
                     print(analyzer_results)
                     CA=1, # Transparency
                     T=analyzed_bounding_box["result"].entity_type
                 )
+                annotations_on_page.append(annotation)
             annotations_all_pages.extend([annotations_on_page])
             print("For page number: ", page_num, " there are ", len(annotations_all_pages[page_num]), " annotations")
             page.Annots = pdf.make_indirect(annotations_on_page)
+            page_num += 1
         # Extracting data from dictionaries
         # extracted_data = []
         # for item in annotations_all_pages: