Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

Sean Pedrick-Case committed on Apr 29

Commit

643a230

unverified ·

2 Parent(s): d998102 36f8e9f

Merge pull request #19 from seanpedrick-case/dev

Browse files

Files changed (6) hide show

DocRedactApp_0.4.0.spec → DocRedactApp_0.6.1.spec +1 -1
pyproject.toml +1 -1
tools/config.py +1 -1
tools/file_conversion.py +3 -2
tools/file_redaction.py +33 -40
tools/redaction_review.py +1 -1

DocRedactApp_0.4.0.spec → DocRedactApp_0.6.1.spec RENAMED Viewed

@@ -62,5 +62,5 @@ coll = COLLECT(
     strip=False,
     upx=True,
     upx_exclude=[],
-    name='DocRedactApp_0.4.0',
 )

     strip=False,
     upx=True,
     upx_exclude=[],
+    name='DocRedactApp_0.6.1',
 )

pyproject.toml CHANGED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "doc_redaction" # Your application's name
-version = "0.6.0" # Your application's current version
 description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface" # A short description
 readme = "README.md" # Path to your project's README file
 requires-python = ">=3.10" # The minimum Python version required

 [project]
 name = "doc_redaction" # Your application's name
+version = "0.6.1" # Your application's current version
 description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface" # A short description
 readme = "README.md" # Path to your project's README file
 requires-python = ">=3.10" # The minimum Python version required

tools/config.py CHANGED Viewed

@@ -249,7 +249,7 @@ else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
 SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', 'False')
-GET_COST_CODES = get_or_create_env_var('GET_COST_CODES', 'True')
 DEFAULT_COST_CODE = get_or_create_env_var('DEFAULT_COST_CODE', '')

 SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', 'False')
+GET_COST_CODES = get_or_create_env_var('GET_COST_CODES', 'False')
 DEFAULT_COST_CODE = get_or_create_env_var('DEFAULT_COST_CODE', '')

tools/file_conversion.py CHANGED Viewed

@@ -1319,6 +1319,7 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
     for col in essential_box_cols:
         if col not in final_df.columns:
             final_df[col] = pd.NA # Add column with NA if it wasn't present in any box
     base_cols = ["image"]
     extra_box_cols = [col for col in final_df.columns if col not in base_cols and col not in essential_box_cols]
@@ -1328,8 +1329,8 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
     # Using fill_value=pd.NA isn't strictly needed here as we added missing columns above,
     # but it's good practice if columns could be missing for other reasons.
     final_df = final_df.reindex(columns=final_col_order, fill_value=pd.NA)
-    final_df = final_df.dropna(subset=["xmin", "xmax", "ymin", "ymax", "text", "id", "label"])
     return final_df

     for col in essential_box_cols:
         if col not in final_df.columns:
             final_df[col] = pd.NA # Add column with NA if it wasn't present in any box
+        final_df[col] = final_df[col].replace({None: pd.NA})
     base_cols = ["image"]
     extra_box_cols = [col for col in final_df.columns if col not in base_cols and col not in essential_box_cols]
     # Using fill_value=pd.NA isn't strictly needed here as we added missing columns above,
     # but it's good practice if columns could be missing for other reasons.
     final_df = final_df.reindex(columns=final_col_order, fill_value=pd.NA)
+    final_df = final_df.dropna(subset=["xmin", "xmax", "ymin", "ymax", "text", "id", "label"], how="all")
+    final_df.replace({None: pd.NA})
     return final_df

tools/file_redaction.py CHANGED Viewed

@@ -536,7 +536,11 @@ def choose_and_run_redactor(file_paths:List[str],
                 if is_pdf(file_path) == False:
                     out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.png"
                     # pymupdf_doc is an image list in this case
-                    img = Image.open(pymupdf_doc[-1])
                     img.save(out_redacted_pdf_file_path, "PNG" ,resolution=image_dpi)
                     #
                 else:
@@ -562,13 +566,14 @@ def choose_and_run_redactor(file_paths:List[str],
             # Convert annotations_all_pages to a consistent relative coordinate format output
             page_sizes = page_sizes_df.to_dict(orient="records")
             all_image_annotations_df = convert_annotation_data_to_dataframe(annotations_all_pages)
-            all_image_annotations_df = divide_coordinates_by_page_sizes(all_image_annotations_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
-            annotations_all_pages = create_annotation_dicts_from_annotation_df(all_image_annotations_df, page_sizes)
-            annotations_all_pages = remove_duplicate_images_with_blank_boxes(annotations_all_pages)
             # Save the gradio_annotation_boxes to a review csv file
-            review_file_state = convert_annotation_json_to_review_df(annotations_all_pages, all_pages_decision_process_table, page_sizes=page_sizes)
             # Don't need page sizes in outputs
             review_file_state.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")
@@ -625,7 +630,7 @@ def choose_and_run_redactor(file_paths:List[str],
     if total_textract_query_number > number_of_pages:
         total_textract_query_number = number_of_pages
-    return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
 def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
     '''
@@ -1352,33 +1357,15 @@ def redact_image_pdf(file_path:str,
             # If using Tesseract
             if text_extraction_method == tesseract_ocr_option:
-                #print("image_path:", image_path)
-                #print("print(type(image_path)):", print(type(image_path)))
-                #if not isinstance(image_path, image_path.image_path) or not isinstance(image_path, str): raise Exception("image_path object for page", reported_page_number, "not found, cannot perform local OCR analysis.")
-                # Check for existing page_line_level_ocr_results_with_words object:
-                # page_line_level_ocr_results = (
-                # all_page_line_level_ocr_results.get('results', [])
-                # if all_page_line_level_ocr_results.get('page') == reported_page_number
-                # else []
-                # )
                 if all_page_line_level_ocr_results_with_words:
                     # Find the first dict where 'page' matches
-                    #print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
-                    print("All pages available:", [item.get('page') for item in all_page_line_level_ocr_results_with_words])
-                    #print("Looking for page:", reported_page_number)
                     matching_page = next(
                     (item for item in all_page_line_level_ocr_results_with_words if int(item.get('page', -1)) == int(reported_page_number)),
                     None
                     )
-                    #print("matching_page:", matching_page)
                     page_line_level_ocr_results_with_words = matching_page if matching_page else []
                 else: page_line_level_ocr_results_with_words = []
@@ -1388,12 +1375,9 @@ def redact_image_pdf(file_path:str,
                 else:
                     page_word_level_ocr_results = image_analyser.perform_ocr(image_path)
-                    print("page_word_level_ocr_results:", page_word_level_ocr_results)
                     page_line_level_ocr_results, page_line_level_ocr_results_with_words = combine_ocr_results(page_word_level_ocr_results, page=reported_page_number)
                     all_page_line_level_ocr_results_with_words.append(page_line_level_ocr_results_with_words)
-                    print("All pages available:", [item.get('page') for item in all_page_line_level_ocr_results_with_words])
             # Check if page exists in existing textract data. If not, send to service to analyse
             if text_extraction_method == textract_option:
@@ -1471,7 +1455,6 @@ def redact_image_pdf(file_path:str,
             all_line_level_ocr_results_df_list.append(line_level_ocr_results_df)
             if pii_identification_method != no_redaction_option:
                 # Step 2: Analyse text and identify PII
                 if chosen_redact_entities or chosen_redact_comprehend_entities:
@@ -1486,7 +1469,7 @@ def redact_image_pdf(file_path:str,
                         entities=chosen_redact_entities,
                         allow_list=allow_list,
                         score_threshold=score_threshold
-                    )
                     comprehend_query_number = comprehend_query_number + comprehend_query_number_new
@@ -1519,20 +1502,20 @@ def redact_image_pdf(file_path:str,
                     # Assume image_path is an image
                     image = image_path
                 fill = (0, 0, 0)   # Fill colour for redactions
                 draw = ImageDraw.Draw(image)
                 all_image_annotations_boxes = []
                 for box in page_merged_redaction_bboxes:
                     try:
                         x0 = box.left
                         y0 = box.top
                         x1 = x0 + box.width
                         y1 = y0 + box.height
                         label = box.entity_type  # Attempt to get the label
                     except AttributeError as e:
                         print(f"Error accessing box attributes: {e}")
                         label = "Redaction"  # Default label if there's an error
@@ -1542,15 +1525,19 @@ def redact_image_pdf(file_path:str,
                         print(f"Invalid coordinates for box: {box}")
                         continue  # Skip this box if coordinates are invalid
-                    # Directly append the dictionary with the required keys
-                    all_image_annotations_boxes.append({
                         "xmin": x0,
                         "ymin": y0,
                         "xmax": x1,
                         "ymax": y1,
                         "label": label,
-                        "color": (0, 0, 0)
-                    })
                     # Draw the rectangle
                     try:
@@ -1558,7 +1545,13 @@ def redact_image_pdf(file_path:str,
                     except Exception as e:
                         print(f"Error drawing rectangle: {e}")
-                page_image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
             # Convert decision process to table
             decision_process_table = pd.DataFrame([{
@@ -1577,7 +1570,7 @@ def redact_image_pdf(file_path:str,
             all_pages_decision_process_table_list.append(decision_process_table)
             decision_process_table = fill_missing_ids(decision_process_table)
-            #decision_process_table.to_csv("output/decision_process_table_with_ids.csv")
             toc = time.perf_counter()
@@ -1591,7 +1584,7 @@ def redact_image_pdf(file_path:str,
                 tqdm._instances.clear()
                 if is_pdf(file_path) == False:
-                    pdf_image_file_paths.append(image_path)
                     pymupdf_doc = pdf_image_file_paths
                 # Check if the image_path already exists in annotations_all_pages
@@ -1604,7 +1597,6 @@ def redact_image_pdf(file_path:str,
                     annotations_all_pages.append(page_image_annotations)
                 if text_extraction_method == textract_option:
                     if original_textract_data != textract_data:
                         # Write the updated existing textract data back to the JSON file
@@ -1626,13 +1618,14 @@ def redact_image_pdf(file_path:str,
                 all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
                 all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
                 current_loop_page += 1
                 return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
         # If it's an image file
         if is_pdf(file_path) == False:
-            pdf_image_file_paths.append(image_path)
             pymupdf_doc = pdf_image_file_paths
         # Check if the image_path already exists in annotations_all_pages

                 if is_pdf(file_path) == False:
                     out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.png"
                     # pymupdf_doc is an image list in this case
+                    if isinstance(pymupdf_doc[-1], str):
+                        img = Image.open(pymupdf_doc[-1])
+                    # Otherwise could be an image object
+                    else:
+                        img = pymupdf_doc[-1]
                     img.save(out_redacted_pdf_file_path, "PNG" ,resolution=image_dpi)
                     #
                 else:
             # Convert annotations_all_pages to a consistent relative coordinate format output
             page_sizes = page_sizes_df.to_dict(orient="records")
             all_image_annotations_df = convert_annotation_data_to_dataframe(annotations_all_pages)
+            all_image_annotations_df = divide_coordinates_by_page_sizes(all_image_annotations_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
+            annotations_all_pages_divide = create_annotation_dicts_from_annotation_df(all_image_annotations_df, page_sizes)
+            annotations_all_pages_divide = remove_duplicate_images_with_blank_boxes(annotations_all_pages_divide)
             # Save the gradio_annotation_boxes to a review csv file
+            review_file_state = convert_annotation_json_to_review_df(annotations_all_pages_divide, all_pages_decision_process_table, page_sizes=page_sizes)
             # Don't need page sizes in outputs
             review_file_state.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")
     if total_textract_query_number > number_of_pages:
         total_textract_query_number = number_of_pages
+    return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages_divide, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
 def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
     '''
             # If using Tesseract
             if text_extraction_method == tesseract_ocr_option:
                 if all_page_line_level_ocr_results_with_words:
                     # Find the first dict where 'page' matches
                     matching_page = next(
                     (item for item in all_page_line_level_ocr_results_with_words if int(item.get('page', -1)) == int(reported_page_number)),
                     None
                     )
                     page_line_level_ocr_results_with_words = matching_page if matching_page else []
                 else: page_line_level_ocr_results_with_words = []
                 else:
                     page_word_level_ocr_results = image_analyser.perform_ocr(image_path)
                     page_line_level_ocr_results, page_line_level_ocr_results_with_words = combine_ocr_results(page_word_level_ocr_results, page=reported_page_number)
                     all_page_line_level_ocr_results_with_words.append(page_line_level_ocr_results_with_words)
             # Check if page exists in existing textract data. If not, send to service to analyse
             if text_extraction_method == textract_option:
             all_line_level_ocr_results_df_list.append(line_level_ocr_results_df)
             if pii_identification_method != no_redaction_option:
                 # Step 2: Analyse text and identify PII
                 if chosen_redact_entities or chosen_redact_comprehend_entities:
                         entities=chosen_redact_entities,
                         allow_list=allow_list,
                         score_threshold=score_threshold
+                    )
                     comprehend_query_number = comprehend_query_number + comprehend_query_number_new
                     # Assume image_path is an image
                     image = image_path
                 fill = (0, 0, 0)   # Fill colour for redactions
                 draw = ImageDraw.Draw(image)
                 all_image_annotations_boxes = []
                 for box in page_merged_redaction_bboxes:
                     try:
                         x0 = box.left
                         y0 = box.top
                         x1 = x0 + box.width
                         y1 = y0 + box.height
                         label = box.entity_type  # Attempt to get the label
+                        text = box.text
                     except AttributeError as e:
                         print(f"Error accessing box attributes: {e}")
                         label = "Redaction"  # Default label if there's an error
                         print(f"Invalid coordinates for box: {box}")
                         continue  # Skip this box if coordinates are invalid
+                    img_annotation_box = {
                         "xmin": x0,
                         "ymin": y0,
                         "xmax": x1,
                         "ymax": y1,
                         "label": label,
+                        "color": (0, 0, 0),
+                        "text": text
+                    }
+                    img_annotation_box = fill_missing_box_ids(img_annotation_box)
+                    # Directly append the dictionary with the required keys
+                    all_image_annotations_boxes.append(img_annotation_box)
                     # Draw the rectangle
                     try:
                     except Exception as e:
                         print(f"Error drawing rectangle: {e}")
+                page_image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
+                print("page_image_annotations at box drawing:", page_image_annotations)
+                redacted_image = image.copy()
+                #redacted_image.save("test_out_image.png")
             # Convert decision process to table
             decision_process_table = pd.DataFrame([{
             all_pages_decision_process_table_list.append(decision_process_table)
             decision_process_table = fill_missing_ids(decision_process_table)
+            decision_process_table.to_csv(output_folder + "decision_process_table_with_ids.csv")
             toc = time.perf_counter()
                 tqdm._instances.clear()
                 if is_pdf(file_path) == False:
+                    pdf_image_file_paths.append(redacted_image) # .append(image_path)
                     pymupdf_doc = pdf_image_file_paths
                 # Check if the image_path already exists in annotations_all_pages
                     annotations_all_pages.append(page_image_annotations)
                 if text_extraction_method == textract_option:
                     if original_textract_data != textract_data:
                         # Write the updated existing textract data back to the JSON file
                 all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
                 all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
                 current_loop_page += 1
                 return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
         # If it's an image file
         if is_pdf(file_path) == False:
+            pdf_image_file_paths.append(redacted_image)#.append(image_path)
             pymupdf_doc = pdf_image_file_paths
         # Check if the image_path already exists in annotations_all_pages

tools/redaction_review.py CHANGED Viewed

@@ -276,7 +276,7 @@ def update_annotator_page_from_review_df(
                 match = re.search(r"(\d+)\.png$", page_state_entry['image'])
                 if match: page_no = int(match.group(1))
-                else: page_no = -1
                 if 'image' in page_state_entry and page_no == page_num_reported_zero_indexed:
                     # Replace the annotations list for this page with the new list from review_df

                 match = re.search(r"(\d+)\.png$", page_state_entry['image'])
                 if match: page_no = int(match.group(1))
+                else: page_no = 0
                 if 'image' in page_state_entry and page_no == page_num_reported_zero_indexed:
                     # Replace the annotations list for this page with the new list from review_df