seanpedrickcase committed on
Commit
36f8e9f
·
1 Parent(s): 47a3a80

Fix for image file redaction

Browse files
DocRedactApp_0.4.0.spec → DocRedactApp_0.6.1.spec RENAMED
@@ -62,5 +62,5 @@ coll = COLLECT(
62
  strip=False,
63
  upx=True,
64
  upx_exclude=[],
65
- name='DocRedactApp_0.4.0',
66
  )
 
62
  strip=False,
63
  upx=True,
64
  upx_exclude=[],
65
+ name='DocRedactApp_0.6.1',
66
  )
pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
 
5
  [project]
6
  name = "doc_redaction" # Your application's name
7
- version = "0.6.0" # Your application's current version
8
  description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface" # A short description
9
  readme = "README.md" # Path to your project's README file
10
  requires-python = ">=3.10" # The minimum Python version required
 
4
 
5
  [project]
6
  name = "doc_redaction" # Your application's name
7
+ version = "0.6.1" # Your application's current version
8
  description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface" # A short description
9
  readme = "README.md" # Path to your project's README file
10
  requires-python = ">=3.10" # The minimum Python version required
tools/config.py CHANGED
@@ -249,7 +249,7 @@ else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
249
 
250
  SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', 'False')
251
 
252
- GET_COST_CODES = get_or_create_env_var('GET_COST_CODES', 'True')
253
 
254
  DEFAULT_COST_CODE = get_or_create_env_var('DEFAULT_COST_CODE', '')
255
 
 
249
 
250
  SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', 'False')
251
 
252
+ GET_COST_CODES = get_or_create_env_var('GET_COST_CODES', 'False')
253
 
254
  DEFAULT_COST_CODE = get_or_create_env_var('DEFAULT_COST_CODE', '')
255
 
tools/file_conversion.py CHANGED
@@ -1319,6 +1319,7 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
1319
  for col in essential_box_cols:
1320
  if col not in final_df.columns:
1321
  final_df[col] = pd.NA # Add column with NA if it wasn't present in any box
 
1322
 
1323
  base_cols = ["image"]
1324
  extra_box_cols = [col for col in final_df.columns if col not in base_cols and col not in essential_box_cols]
@@ -1328,8 +1329,8 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
1328
  # Using fill_value=pd.NA isn't strictly needed here as we added missing columns above,
1329
  # but it's good practice if columns could be missing for other reasons.
1330
  final_df = final_df.reindex(columns=final_col_order, fill_value=pd.NA)
1331
-
1332
- final_df = final_df.dropna(subset=["xmin", "xmax", "ymin", "ymax", "text", "id", "label"])
1333
 
1334
  return final_df
1335
 
 
1319
  for col in essential_box_cols:
1320
  if col not in final_df.columns:
1321
  final_df[col] = pd.NA # Add column with NA if it wasn't present in any box
1322
+ final_df[col] = final_df[col].replace({None: pd.NA})
1323
 
1324
  base_cols = ["image"]
1325
  extra_box_cols = [col for col in final_df.columns if col not in base_cols and col not in essential_box_cols]
 
1329
  # Using fill_value=pd.NA isn't strictly needed here as we added missing columns above,
1330
  # but it's good practice if columns could be missing for other reasons.
1331
  final_df = final_df.reindex(columns=final_col_order, fill_value=pd.NA)
1332
+ final_df = final_df.dropna(subset=["xmin", "xmax", "ymin", "ymax", "text", "id", "label"], how="all")
1333
+ final_df.replace({None: pd.NA})
1334
 
1335
  return final_df
1336
 
tools/file_redaction.py CHANGED
@@ -536,7 +536,11 @@ def choose_and_run_redactor(file_paths:List[str],
536
  if is_pdf(file_path) == False:
537
  out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.png"
538
  # pymupdf_doc is an image list in this case
539
- img = Image.open(pymupdf_doc[-1])
 
 
 
 
540
  img.save(out_redacted_pdf_file_path, "PNG" ,resolution=image_dpi)
541
  #
542
  else:
@@ -562,13 +566,14 @@ def choose_and_run_redactor(file_paths:List[str],
562
  # Convert annotations_all_pages to a consistent relative coordinate format output
563
  page_sizes = page_sizes_df.to_dict(orient="records")
564
  all_image_annotations_df = convert_annotation_data_to_dataframe(annotations_all_pages)
565
- all_image_annotations_df = divide_coordinates_by_page_sizes(all_image_annotations_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
566
- annotations_all_pages = create_annotation_dicts_from_annotation_df(all_image_annotations_df, page_sizes)
567
 
568
- annotations_all_pages = remove_duplicate_images_with_blank_boxes(annotations_all_pages)
 
 
 
569
 
570
  # Save the gradio_annotation_boxes to a review csv file
571
- review_file_state = convert_annotation_json_to_review_df(annotations_all_pages, all_pages_decision_process_table, page_sizes=page_sizes)
572
 
573
  # Don't need page sizes in outputs
574
  review_file_state.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")
@@ -625,7 +630,7 @@ def choose_and_run_redactor(file_paths:List[str],
625
  if total_textract_query_number > number_of_pages:
626
  total_textract_query_number = number_of_pages
627
 
628
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
629
 
630
  def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
631
  '''
@@ -1352,33 +1357,15 @@ def redact_image_pdf(file_path:str,
1352
 
1353
  # If using Tesseract
1354
  if text_extraction_method == tesseract_ocr_option:
1355
- #print("image_path:", image_path)
1356
- #print("print(type(image_path)):", print(type(image_path)))
1357
- #if not isinstance(image_path, image_path.image_path) or not isinstance(image_path, str): raise Exception("image_path object for page", reported_page_number, "not found, cannot perform local OCR analysis.")
1358
-
1359
- # Check for existing page_line_level_ocr_results_with_words object:
1360
-
1361
- # page_line_level_ocr_results = (
1362
- # all_page_line_level_ocr_results.get('results', [])
1363
- # if all_page_line_level_ocr_results.get('page') == reported_page_number
1364
- # else []
1365
- # )
1366
 
1367
  if all_page_line_level_ocr_results_with_words:
1368
  # Find the first dict where 'page' matches
1369
 
1370
- #print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
1371
-
1372
- print("All pages available:", [item.get('page') for item in all_page_line_level_ocr_results_with_words])
1373
- #print("Looking for page:", reported_page_number)
1374
-
1375
  matching_page = next(
1376
  (item for item in all_page_line_level_ocr_results_with_words if int(item.get('page', -1)) == int(reported_page_number)),
1377
  None
1378
  )
1379
 
1380
- #print("matching_page:", matching_page)
1381
-
1382
  page_line_level_ocr_results_with_words = matching_page if matching_page else []
1383
  else: page_line_level_ocr_results_with_words = []
1384
 
@@ -1388,12 +1375,9 @@ def redact_image_pdf(file_path:str,
1388
  else:
1389
  page_word_level_ocr_results = image_analyser.perform_ocr(image_path)
1390
 
1391
- print("page_word_level_ocr_results:", page_word_level_ocr_results)
1392
  page_line_level_ocr_results, page_line_level_ocr_results_with_words = combine_ocr_results(page_word_level_ocr_results, page=reported_page_number)
1393
 
1394
  all_page_line_level_ocr_results_with_words.append(page_line_level_ocr_results_with_words)
1395
-
1396
- print("All pages available:", [item.get('page') for item in all_page_line_level_ocr_results_with_words])
1397
 
1398
  # Check if page exists in existing textract data. If not, send to service to analyse
1399
  if text_extraction_method == textract_option:
@@ -1471,7 +1455,6 @@ def redact_image_pdf(file_path:str,
1471
 
1472
  all_line_level_ocr_results_df_list.append(line_level_ocr_results_df)
1473
 
1474
-
1475
  if pii_identification_method != no_redaction_option:
1476
  # Step 2: Analyse text and identify PII
1477
  if chosen_redact_entities or chosen_redact_comprehend_entities:
@@ -1486,7 +1469,7 @@ def redact_image_pdf(file_path:str,
1486
  entities=chosen_redact_entities,
1487
  allow_list=allow_list,
1488
  score_threshold=score_threshold
1489
- )
1490
 
1491
  comprehend_query_number = comprehend_query_number + comprehend_query_number_new
1492
 
@@ -1519,20 +1502,20 @@ def redact_image_pdf(file_path:str,
1519
  # Assume image_path is an image
1520
  image = image_path
1521
 
1522
-
1523
  fill = (0, 0, 0) # Fill colour for redactions
1524
  draw = ImageDraw.Draw(image)
1525
 
1526
  all_image_annotations_boxes = []
1527
 
1528
  for box in page_merged_redaction_bboxes:
 
1529
  try:
1530
  x0 = box.left
1531
  y0 = box.top
1532
  x1 = x0 + box.width
1533
  y1 = y0 + box.height
1534
-
1535
  label = box.entity_type # Attempt to get the label
 
1536
  except AttributeError as e:
1537
  print(f"Error accessing box attributes: {e}")
1538
  label = "Redaction" # Default label if there's an error
@@ -1542,15 +1525,19 @@ def redact_image_pdf(file_path:str,
1542
  print(f"Invalid coordinates for box: {box}")
1543
  continue # Skip this box if coordinates are invalid
1544
 
1545
- # Directly append the dictionary with the required keys
1546
- all_image_annotations_boxes.append({
1547
  "xmin": x0,
1548
  "ymin": y0,
1549
  "xmax": x1,
1550
  "ymax": y1,
1551
  "label": label,
1552
- "color": (0, 0, 0)
1553
- })
 
 
 
 
 
1554
 
1555
  # Draw the rectangle
1556
  try:
@@ -1558,7 +1545,13 @@ def redact_image_pdf(file_path:str,
1558
  except Exception as e:
1559
  print(f"Error drawing rectangle: {e}")
1560
 
1561
- page_image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
 
 
 
 
 
 
1562
 
1563
  # Convert decision process to table
1564
  decision_process_table = pd.DataFrame([{
@@ -1577,7 +1570,7 @@ def redact_image_pdf(file_path:str,
1577
  all_pages_decision_process_table_list.append(decision_process_table)
1578
 
1579
  decision_process_table = fill_missing_ids(decision_process_table)
1580
- #decision_process_table.to_csv("output/decision_process_table_with_ids.csv")
1581
 
1582
  toc = time.perf_counter()
1583
 
@@ -1591,7 +1584,7 @@ def redact_image_pdf(file_path:str,
1591
  tqdm._instances.clear()
1592
 
1593
  if is_pdf(file_path) == False:
1594
- pdf_image_file_paths.append(image_path)
1595
  pymupdf_doc = pdf_image_file_paths
1596
 
1597
  # Check if the image_path already exists in annotations_all_pages
@@ -1604,7 +1597,6 @@ def redact_image_pdf(file_path:str,
1604
  annotations_all_pages.append(page_image_annotations)
1605
 
1606
 
1607
-
1608
  if text_extraction_method == textract_option:
1609
  if original_textract_data != textract_data:
1610
  # Write the updated existing textract data back to the JSON file
@@ -1626,13 +1618,14 @@ def redact_image_pdf(file_path:str,
1626
  all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
1627
  all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
1628
 
 
1629
  current_loop_page += 1
1630
 
1631
  return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
1632
 
1633
  # If it's an image file
1634
  if is_pdf(file_path) == False:
1635
- pdf_image_file_paths.append(image_path)
1636
  pymupdf_doc = pdf_image_file_paths
1637
 
1638
  # Check if the image_path already exists in annotations_all_pages
 
536
  if is_pdf(file_path) == False:
537
  out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.png"
538
  # pymupdf_doc is an image list in this case
539
+ if isinstance(pymupdf_doc[-1], str):
540
+ img = Image.open(pymupdf_doc[-1])
541
+ # Otherwise could be an image object
542
+ else:
543
+ img = pymupdf_doc[-1]
544
  img.save(out_redacted_pdf_file_path, "PNG" ,resolution=image_dpi)
545
  #
546
  else:
 
566
  # Convert annotations_all_pages to a consistent relative coordinate format output
567
  page_sizes = page_sizes_df.to_dict(orient="records")
568
  all_image_annotations_df = convert_annotation_data_to_dataframe(annotations_all_pages)
 
 
569
 
570
+ all_image_annotations_df = divide_coordinates_by_page_sizes(all_image_annotations_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
571
+
572
+ annotations_all_pages_divide = create_annotation_dicts_from_annotation_df(all_image_annotations_df, page_sizes)
573
+ annotations_all_pages_divide = remove_duplicate_images_with_blank_boxes(annotations_all_pages_divide)
574
 
575
  # Save the gradio_annotation_boxes to a review csv file
576
+ review_file_state = convert_annotation_json_to_review_df(annotations_all_pages_divide, all_pages_decision_process_table, page_sizes=page_sizes)
577
 
578
  # Don't need page sizes in outputs
579
  review_file_state.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")
 
630
  if total_textract_query_number > number_of_pages:
631
  total_textract_query_number = number_of_pages
632
 
633
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages_divide, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
634
 
635
  def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
636
  '''
 
1357
 
1358
  # If using Tesseract
1359
  if text_extraction_method == tesseract_ocr_option:
 
 
 
 
 
 
 
 
 
 
 
1360
 
1361
  if all_page_line_level_ocr_results_with_words:
1362
  # Find the first dict where 'page' matches
1363
 
 
 
 
 
 
1364
  matching_page = next(
1365
  (item for item in all_page_line_level_ocr_results_with_words if int(item.get('page', -1)) == int(reported_page_number)),
1366
  None
1367
  )
1368
 
 
 
1369
  page_line_level_ocr_results_with_words = matching_page if matching_page else []
1370
  else: page_line_level_ocr_results_with_words = []
1371
 
 
1375
  else:
1376
  page_word_level_ocr_results = image_analyser.perform_ocr(image_path)
1377
 
 
1378
  page_line_level_ocr_results, page_line_level_ocr_results_with_words = combine_ocr_results(page_word_level_ocr_results, page=reported_page_number)
1379
 
1380
  all_page_line_level_ocr_results_with_words.append(page_line_level_ocr_results_with_words)
 
 
1381
 
1382
  # Check if page exists in existing textract data. If not, send to service to analyse
1383
  if text_extraction_method == textract_option:
 
1455
 
1456
  all_line_level_ocr_results_df_list.append(line_level_ocr_results_df)
1457
 
 
1458
  if pii_identification_method != no_redaction_option:
1459
  # Step 2: Analyse text and identify PII
1460
  if chosen_redact_entities or chosen_redact_comprehend_entities:
 
1469
  entities=chosen_redact_entities,
1470
  allow_list=allow_list,
1471
  score_threshold=score_threshold
1472
+ )
1473
 
1474
  comprehend_query_number = comprehend_query_number + comprehend_query_number_new
1475
 
 
1502
  # Assume image_path is an image
1503
  image = image_path
1504
 
 
1505
  fill = (0, 0, 0) # Fill colour for redactions
1506
  draw = ImageDraw.Draw(image)
1507
 
1508
  all_image_annotations_boxes = []
1509
 
1510
  for box in page_merged_redaction_bboxes:
1511
+
1512
  try:
1513
  x0 = box.left
1514
  y0 = box.top
1515
  x1 = x0 + box.width
1516
  y1 = y0 + box.height
 
1517
  label = box.entity_type # Attempt to get the label
1518
+ text = box.text
1519
  except AttributeError as e:
1520
  print(f"Error accessing box attributes: {e}")
1521
  label = "Redaction" # Default label if there's an error
 
1525
  print(f"Invalid coordinates for box: {box}")
1526
  continue # Skip this box if coordinates are invalid
1527
 
1528
+ img_annotation_box = {
 
1529
  "xmin": x0,
1530
  "ymin": y0,
1531
  "xmax": x1,
1532
  "ymax": y1,
1533
  "label": label,
1534
+ "color": (0, 0, 0),
1535
+ "text": text
1536
+ }
1537
+ img_annotation_box = fill_missing_box_ids(img_annotation_box)
1538
+
1539
+ # Directly append the dictionary with the required keys
1540
+ all_image_annotations_boxes.append(img_annotation_box)
1541
 
1542
  # Draw the rectangle
1543
  try:
 
1545
  except Exception as e:
1546
  print(f"Error drawing rectangle: {e}")
1547
 
1548
+ page_image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
1549
+
1550
+ print("page_image_annotations at box drawing:", page_image_annotations)
1551
+
1552
+ redacted_image = image.copy()
1553
+ #redacted_image.save("test_out_image.png")
1554
+
1555
 
1556
  # Convert decision process to table
1557
  decision_process_table = pd.DataFrame([{
 
1570
  all_pages_decision_process_table_list.append(decision_process_table)
1571
 
1572
  decision_process_table = fill_missing_ids(decision_process_table)
1573
+ decision_process_table.to_csv(output_folder + "decision_process_table_with_ids.csv")
1574
 
1575
  toc = time.perf_counter()
1576
 
 
1584
  tqdm._instances.clear()
1585
 
1586
  if is_pdf(file_path) == False:
1587
+ pdf_image_file_paths.append(redacted_image) # .append(image_path)
1588
  pymupdf_doc = pdf_image_file_paths
1589
 
1590
  # Check if the image_path already exists in annotations_all_pages
 
1597
  annotations_all_pages.append(page_image_annotations)
1598
 
1599
 
 
1600
  if text_extraction_method == textract_option:
1601
  if original_textract_data != textract_data:
1602
  # Write the updated existing textract data back to the JSON file
 
1618
  all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
1619
  all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
1620
 
1621
+
1622
  current_loop_page += 1
1623
 
1624
  return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
1625
 
1626
  # If it's an image file
1627
  if is_pdf(file_path) == False:
1628
+ pdf_image_file_paths.append(redacted_image)#.append(image_path)
1629
  pymupdf_doc = pdf_image_file_paths
1630
 
1631
  # Check if the image_path already exists in annotations_all_pages
tools/redaction_review.py CHANGED
@@ -276,7 +276,7 @@ def update_annotator_page_from_review_df(
276
 
277
  match = re.search(r"(\d+)\.png$", page_state_entry['image'])
278
  if match: page_no = int(match.group(1))
279
- else: page_no = -1
280
 
281
  if 'image' in page_state_entry and page_no == page_num_reported_zero_indexed:
282
  # Replace the annotations list for this page with the new list from review_df
 
276
 
277
  match = re.search(r"(\d+)\.png$", page_state_entry['image'])
278
  if match: page_no = int(match.group(1))
279
+ else: page_no = 0
280
 
281
  if 'image' in page_state_entry and page_no == page_num_reported_zero_indexed:
282
  # Replace the annotations list for this page with the new list from review_df