Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Community

seanpedrickcase committed on Jan 13

Commit

11770c9

1 Parent(s): eafaaed

Fixed redaction of image files

Browse files

Files changed (2) hide show

tools/file_conversion.py +9 -28
tools/file_redaction.py +3 -41

tools/file_conversion.py CHANGED Viewed

@@ -478,11 +478,12 @@ def prepare_image_or_pdf(
                     annotation["image"] = image_path
                     all_annotations_object.append(annotation)
-                #print("all_annotations_object:", all_annotations_object)
         elif is_pdf_or_image(file_path):  # Alternatively, if it's an image
             # Convert image to a pymupdf document
             pymupdf_doc = pymupdf.open()  # Create a new empty document
@@ -491,10 +492,12 @@ def prepare_image_or_pdf(
             page = pymupdf_doc.new_page(width=img.width, height=img.height)  # Add a new page
             page.insert_image(rect, filename=file_path)  # Insert the image into the page
-        # Check if the file is an image type and the user selected text ocr option
-        elif file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
-            in_redact_method = tesseract_ocr_option
         elif file_extension in ['.csv']:
             review_file_csv = read_file(file)
@@ -618,12 +621,7 @@ def prepare_image_or_pdf(
         out_message.append(out_time)
         out_message_out = '\n'.join(out_message)
-    #if prepare_for_review == False:
     number_of_pages = len(image_file_paths)
-    #else:
-    #    number_of_pages = len(all_annotations_object)
-    #print("all_annotations_object at end:", all_annotations_object)
     return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
@@ -650,23 +648,6 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str],
     return out_message, out_file_paths
-# Example DataFrames
-# df1 = pd.DataFrame({
-#     'xmin': [10, 20, 30],
-#     'xmax': [15, 25, 35],
-#     'ymin': [40, 50, 60],
-#     'ymax': [45, 55, 65],
-#     'info1': ['A', 'B', 'C']
-# })
-# df2 = pd.DataFrame({
-#     'xmin': [12, 18, 32],
-#     'xmax': [14, 24, 34],
-#     'ymin': [42, 48, 62],
-#     'ymax': [44, 54, 66],
-#     'info2': ['X', 'Y', 'Z']
-# })
 def join_values_within_threshold(df1, df2):
     # Threshold for matching
     threshold = 5

                     annotation["image"] = image_path
                     all_annotations_object.append(annotation)
         elif is_pdf_or_image(file_path):  # Alternatively, if it's an image
+            # Check if the file is an image type and the user selected text ocr option
+            if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
+                in_redact_method = tesseract_ocr_option
             # Convert image to a pymupdf document
             pymupdf_doc = pymupdf.open()  # Create a new empty document
             page = pymupdf_doc.new_page(width=img.width, height=img.height)  # Add a new page
             page.insert_image(rect, filename=file_path)  # Insert the image into the page
+            file_path_str = str(file_path)
+            image_file_paths = process_file(file_path_str, prepare_for_review)
+            print("Inserted image into PDF file")
         elif file_extension in ['.csv']:
             review_file_csv = read_file(file)
         out_message.append(out_time)
         out_message_out = '\n'.join(out_message)
     number_of_pages = len(image_file_paths)
     return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
     return out_message, out_file_paths
 def join_values_within_threshold(df1, df2):
     # Threshold for matching
     threshold = 5

tools/file_redaction.py CHANGED Viewed

@@ -269,7 +269,7 @@ def choose_and_run_redactor(file_paths:List[str],
             print("Redacting file:", file_path_without_ext)
             is_a_pdf = is_pdf(file_path) == True
-            if is_a_pdf == False:
                 # If user has not submitted a pdf, assume it's an image
                 print("File is not a pdf, assuming that image analysis needs to be used.")
                 in_redact_method = tesseract_ocr_option
@@ -753,8 +753,6 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
     return page, out_annotation_boxes
 def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
     all_bboxes = []
@@ -767,6 +765,8 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
     # Process signature and handwriting results
     if signature_recogniser_results or handwriting_recogniser_results:
         if "Redact all identified handwriting" in handwrite_signature_checkbox:
             merged_bboxes.extend(copy.deepcopy(handwriting_recogniser_results))
         if "Redact all identified signatures" in handwrite_signature_checkbox:
@@ -1085,44 +1085,6 @@ def redact_image_pdf(file_path:str,
                         text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
-                # if not os.path.exists(json_file_path):
-                #     text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox)  # Analyse page with Textract
-                #     log_files_output_paths.append(json_file_path)
-                #     request_metadata = request_metadata + "\n" + new_request_metadata
-                #     existing_data = {"pages":[text_blocks]}
-                # else:
-                #     # Open the file and load the JSON data
-                #     print("Found existing Textract json results file.")
-                #     with open(json_file_path, 'r') as json_file:
-                #         existing_data = json.load(json_file)
-                #         # Check if the current reported_page_number exists in the loaded JSON
-                #         page_exists = any(page['page_no'] == reported_page_number for page in existing_data.get("pages", []))
-                #         if not page_exists:  # If the page does not exist, analyze again
-                #             print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
-                #             text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox)  # Analyse page with Textract
-                #             # Check if "pages" key exists, if not, initialize it as an empty list
-                #             if "pages" not in existing_data:
-                #                 existing_data["pages"] = []
-                #             # Append the new page data
-                #             existing_data["pages"].append(text_blocks)
-                #             # Write the updated existing_data back to the JSON file
-                #             with open(json_file_path, 'w') as json_file:
-                #                 json.dump(existing_data, json_file, indent=4)  # indent=4 makes the JSON file pretty-printed
-                #             log_files_output_paths.append(json_file_path)
-                #             request_metadata = request_metadata + "\n" + new_request_metadata
-                #         else:
-                #             # If the page exists, retrieve the data
-                #             text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
                 line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
             # Step 2: Analyze text and identify PII

             print("Redacting file:", file_path_without_ext)
             is_a_pdf = is_pdf(file_path) == True
+            if is_a_pdf == False and in_redact_method == text_ocr_option:
                 # If user has not submitted a pdf, assume it's an image
                 print("File is not a pdf, assuming that image analysis needs to be used.")
                 in_redact_method = tesseract_ocr_option
     return page, out_annotation_boxes
 def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
     all_bboxes = []
     # Process signature and handwriting results
     if signature_recogniser_results or handwriting_recogniser_results:
         if "Redact all identified handwriting" in handwrite_signature_checkbox:
+            print("handwriting_recogniser_results:", handwriting_recogniser_results)
             merged_bboxes.extend(copy.deepcopy(handwriting_recogniser_results))
         if "Redact all identified signatures" in handwrite_signature_checkbox:
                         text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
                 line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
             # Step 2: Analyze text and identify PII