Merge pull request #19 from seanpedrick-case/dev
Browse files
- DocRedactApp_0.4.0.spec → DocRedactApp_0.6.1.spec +1 -1
- pyproject.toml +1 -1
- tools/config.py +1 -1
- tools/file_conversion.py +3 -2
- tools/file_redaction.py +33 -40
- tools/redaction_review.py +1 -1
DocRedactApp_0.4.0.spec → DocRedactApp_0.6.1.spec
RENAMED
|
@@ -62,5 +62,5 @@ coll = COLLECT(
|
|
| 62 |
strip=False,
|
| 63 |
upx=True,
|
| 64 |
upx_exclude=[],
|
| 65 |
-
name='DocRedactApp_0.
|
| 66 |
)
|
|
|
|
| 62 |
strip=False,
|
| 63 |
upx=True,
|
| 64 |
upx_exclude=[],
|
| 65 |
+
name='DocRedactApp_0.6.1',
|
| 66 |
)
|
pyproject.toml
CHANGED
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
| 4 |
|
| 5 |
[project]
|
| 6 |
name = "doc_redaction" # Your application's name
|
| 7 |
-
version = "0.6.
|
| 8 |
description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface" # A short description
|
| 9 |
readme = "README.md" # Path to your project's README file
|
| 10 |
requires-python = ">=3.10" # The minimum Python version required
|
|
|
|
| 4 |
|
| 5 |
[project]
|
| 6 |
name = "doc_redaction" # Your application's name
|
| 7 |
+
version = "0.6.1" # Your application's current version
|
| 8 |
description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface" # A short description
|
| 9 |
readme = "README.md" # Path to your project's README file
|
| 10 |
requires-python = ">=3.10" # The minimum Python version required
|
tools/config.py
CHANGED
|
@@ -249,7 +249,7 @@ else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
|
|
| 249 |
|
| 250 |
SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', 'False')
|
| 251 |
|
| 252 |
-
GET_COST_CODES = get_or_create_env_var('GET_COST_CODES', '
|
| 253 |
|
| 254 |
DEFAULT_COST_CODE = get_or_create_env_var('DEFAULT_COST_CODE', '')
|
| 255 |
|
|
|
|
| 249 |
|
| 250 |
SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', 'False')
|
| 251 |
|
| 252 |
+
GET_COST_CODES = get_or_create_env_var('GET_COST_CODES', 'False')
|
| 253 |
|
| 254 |
DEFAULT_COST_CODE = get_or_create_env_var('DEFAULT_COST_CODE', '')
|
| 255 |
|
tools/file_conversion.py
CHANGED
|
@@ -1319,6 +1319,7 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
|
|
| 1319 |
for col in essential_box_cols:
|
| 1320 |
if col not in final_df.columns:
|
| 1321 |
final_df[col] = pd.NA # Add column with NA if it wasn't present in any box
|
|
|
|
| 1322 |
|
| 1323 |
base_cols = ["image"]
|
| 1324 |
extra_box_cols = [col for col in final_df.columns if col not in base_cols and col not in essential_box_cols]
|
|
@@ -1328,8 +1329,8 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
|
|
| 1328 |
# Using fill_value=pd.NA isn't strictly needed here as we added missing columns above,
|
| 1329 |
# but it's good practice if columns could be missing for other reasons.
|
| 1330 |
final_df = final_df.reindex(columns=final_col_order, fill_value=pd.NA)
|
| 1331 |
-
|
| 1332 |
-
final_df
|
| 1333 |
|
| 1334 |
return final_df
|
| 1335 |
|
|
|
|
| 1319 |
for col in essential_box_cols:
|
| 1320 |
if col not in final_df.columns:
|
| 1321 |
final_df[col] = pd.NA # Add column with NA if it wasn't present in any box
|
| 1322 |
+
final_df[col] = final_df[col].replace({None: pd.NA})
|
| 1323 |
|
| 1324 |
base_cols = ["image"]
|
| 1325 |
extra_box_cols = [col for col in final_df.columns if col not in base_cols and col not in essential_box_cols]
|
|
|
|
| 1329 |
# Using fill_value=pd.NA isn't strictly needed here as we added missing columns above,
|
| 1330 |
# but it's good practice if columns could be missing for other reasons.
|
| 1331 |
final_df = final_df.reindex(columns=final_col_order, fill_value=pd.NA)
|
| 1332 |
+
final_df = final_df.dropna(subset=["xmin", "xmax", "ymin", "ymax", "text", "id", "label"], how="all")
|
| 1333 |
+
final_df.replace({None: pd.NA})
|
| 1334 |
|
| 1335 |
return final_df
|
| 1336 |
|
tools/file_redaction.py
CHANGED
|
@@ -536,7 +536,11 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 536 |
if is_pdf(file_path) == False:
|
| 537 |
out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.png"
|
| 538 |
# pymupdf_doc is an image list in this case
|
| 539 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 540 |
img.save(out_redacted_pdf_file_path, "PNG" ,resolution=image_dpi)
|
| 541 |
#
|
| 542 |
else:
|
|
@@ -562,13 +566,14 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 562 |
# Convert annotations_all_pages to a consistent relative coordinate format output
|
| 563 |
page_sizes = page_sizes_df.to_dict(orient="records")
|
| 564 |
all_image_annotations_df = convert_annotation_data_to_dataframe(annotations_all_pages)
|
| 565 |
-
all_image_annotations_df = divide_coordinates_by_page_sizes(all_image_annotations_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
|
| 566 |
-
annotations_all_pages = create_annotation_dicts_from_annotation_df(all_image_annotations_df, page_sizes)
|
| 567 |
|
| 568 |
-
|
|
|
|
|
|
|
|
|
|
| 569 |
|
| 570 |
# Save the gradio_annotation_boxes to a review csv file
|
| 571 |
-
review_file_state = convert_annotation_json_to_review_df(
|
| 572 |
|
| 573 |
# Don't need page sizes in outputs
|
| 574 |
review_file_state.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")
|
|
@@ -625,7 +630,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 625 |
if total_textract_query_number > number_of_pages:
|
| 626 |
total_textract_query_number = number_of_pages
|
| 627 |
|
| 628 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc,
|
| 629 |
|
| 630 |
def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
|
| 631 |
'''
|
|
@@ -1352,33 +1357,15 @@ def redact_image_pdf(file_path:str,
|
|
| 1352 |
|
| 1353 |
# If using Tesseract
|
| 1354 |
if text_extraction_method == tesseract_ocr_option:
|
| 1355 |
-
#print("image_path:", image_path)
|
| 1356 |
-
#print("print(type(image_path)):", print(type(image_path)))
|
| 1357 |
-
#if not isinstance(image_path, image_path.image_path) or not isinstance(image_path, str): raise Exception("image_path object for page", reported_page_number, "not found, cannot perform local OCR analysis.")
|
| 1358 |
-
|
| 1359 |
-
# Check for existing page_line_level_ocr_results_with_words object:
|
| 1360 |
-
|
| 1361 |
-
# page_line_level_ocr_results = (
|
| 1362 |
-
# all_page_line_level_ocr_results.get('results', [])
|
| 1363 |
-
# if all_page_line_level_ocr_results.get('page') == reported_page_number
|
| 1364 |
-
# else []
|
| 1365 |
-
# )
|
| 1366 |
|
| 1367 |
if all_page_line_level_ocr_results_with_words:
|
| 1368 |
# Find the first dict where 'page' matches
|
| 1369 |
|
| 1370 |
-
#print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
|
| 1371 |
-
|
| 1372 |
-
print("All pages available:", [item.get('page') for item in all_page_line_level_ocr_results_with_words])
|
| 1373 |
-
#print("Looking for page:", reported_page_number)
|
| 1374 |
-
|
| 1375 |
matching_page = next(
|
| 1376 |
(item for item in all_page_line_level_ocr_results_with_words if int(item.get('page', -1)) == int(reported_page_number)),
|
| 1377 |
None
|
| 1378 |
)
|
| 1379 |
|
| 1380 |
-
#print("matching_page:", matching_page)
|
| 1381 |
-
|
| 1382 |
page_line_level_ocr_results_with_words = matching_page if matching_page else []
|
| 1383 |
else: page_line_level_ocr_results_with_words = []
|
| 1384 |
|
|
@@ -1388,12 +1375,9 @@ def redact_image_pdf(file_path:str,
|
|
| 1388 |
else:
|
| 1389 |
page_word_level_ocr_results = image_analyser.perform_ocr(image_path)
|
| 1390 |
|
| 1391 |
-
print("page_word_level_ocr_results:", page_word_level_ocr_results)
|
| 1392 |
page_line_level_ocr_results, page_line_level_ocr_results_with_words = combine_ocr_results(page_word_level_ocr_results, page=reported_page_number)
|
| 1393 |
|
| 1394 |
all_page_line_level_ocr_results_with_words.append(page_line_level_ocr_results_with_words)
|
| 1395 |
-
|
| 1396 |
-
print("All pages available:", [item.get('page') for item in all_page_line_level_ocr_results_with_words])
|
| 1397 |
|
| 1398 |
# Check if page exists in existing textract data. If not, send to service to analyse
|
| 1399 |
if text_extraction_method == textract_option:
|
|
@@ -1471,7 +1455,6 @@ def redact_image_pdf(file_path:str,
|
|
| 1471 |
|
| 1472 |
all_line_level_ocr_results_df_list.append(line_level_ocr_results_df)
|
| 1473 |
|
| 1474 |
-
|
| 1475 |
if pii_identification_method != no_redaction_option:
|
| 1476 |
# Step 2: Analyse text and identify PII
|
| 1477 |
if chosen_redact_entities or chosen_redact_comprehend_entities:
|
|
@@ -1486,7 +1469,7 @@ def redact_image_pdf(file_path:str,
|
|
| 1486 |
entities=chosen_redact_entities,
|
| 1487 |
allow_list=allow_list,
|
| 1488 |
score_threshold=score_threshold
|
| 1489 |
-
)
|
| 1490 |
|
| 1491 |
comprehend_query_number = comprehend_query_number + comprehend_query_number_new
|
| 1492 |
|
|
@@ -1519,20 +1502,20 @@ def redact_image_pdf(file_path:str,
|
|
| 1519 |
# Assume image_path is an image
|
| 1520 |
image = image_path
|
| 1521 |
|
| 1522 |
-
|
| 1523 |
fill = (0, 0, 0) # Fill colour for redactions
|
| 1524 |
draw = ImageDraw.Draw(image)
|
| 1525 |
|
| 1526 |
all_image_annotations_boxes = []
|
| 1527 |
|
| 1528 |
for box in page_merged_redaction_bboxes:
|
|
|
|
| 1529 |
try:
|
| 1530 |
x0 = box.left
|
| 1531 |
y0 = box.top
|
| 1532 |
x1 = x0 + box.width
|
| 1533 |
y1 = y0 + box.height
|
| 1534 |
-
|
| 1535 |
label = box.entity_type # Attempt to get the label
|
|
|
|
| 1536 |
except AttributeError as e:
|
| 1537 |
print(f"Error accessing box attributes: {e}")
|
| 1538 |
label = "Redaction" # Default label if there's an error
|
|
@@ -1542,15 +1525,19 @@ def redact_image_pdf(file_path:str,
|
|
| 1542 |
print(f"Invalid coordinates for box: {box}")
|
| 1543 |
continue # Skip this box if coordinates are invalid
|
| 1544 |
|
| 1545 |
-
|
| 1546 |
-
all_image_annotations_boxes.append({
|
| 1547 |
"xmin": x0,
|
| 1548 |
"ymin": y0,
|
| 1549 |
"xmax": x1,
|
| 1550 |
"ymax": y1,
|
| 1551 |
"label": label,
|
| 1552 |
-
"color": (0, 0, 0)
|
| 1553 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1554 |
|
| 1555 |
# Draw the rectangle
|
| 1556 |
try:
|
|
@@ -1558,7 +1545,13 @@ def redact_image_pdf(file_path:str,
|
|
| 1558 |
except Exception as e:
|
| 1559 |
print(f"Error drawing rectangle: {e}")
|
| 1560 |
|
| 1561 |
-
page_image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1562 |
|
| 1563 |
# Convert decision process to table
|
| 1564 |
decision_process_table = pd.DataFrame([{
|
|
@@ -1577,7 +1570,7 @@ def redact_image_pdf(file_path:str,
|
|
| 1577 |
all_pages_decision_process_table_list.append(decision_process_table)
|
| 1578 |
|
| 1579 |
decision_process_table = fill_missing_ids(decision_process_table)
|
| 1580 |
-
|
| 1581 |
|
| 1582 |
toc = time.perf_counter()
|
| 1583 |
|
|
@@ -1591,7 +1584,7 @@ def redact_image_pdf(file_path:str,
|
|
| 1591 |
tqdm._instances.clear()
|
| 1592 |
|
| 1593 |
if is_pdf(file_path) == False:
|
| 1594 |
-
pdf_image_file_paths.append(image_path)
|
| 1595 |
pymupdf_doc = pdf_image_file_paths
|
| 1596 |
|
| 1597 |
# Check if the image_path already exists in annotations_all_pages
|
|
@@ -1604,7 +1597,6 @@ def redact_image_pdf(file_path:str,
|
|
| 1604 |
annotations_all_pages.append(page_image_annotations)
|
| 1605 |
|
| 1606 |
|
| 1607 |
-
|
| 1608 |
if text_extraction_method == textract_option:
|
| 1609 |
if original_textract_data != textract_data:
|
| 1610 |
# Write the updated existing textract data back to the JSON file
|
|
@@ -1626,13 +1618,14 @@ def redact_image_pdf(file_path:str,
|
|
| 1626 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
|
| 1627 |
all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
|
| 1628 |
|
|
|
|
| 1629 |
current_loop_page += 1
|
| 1630 |
|
| 1631 |
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
|
| 1632 |
|
| 1633 |
# If it's an image file
|
| 1634 |
if is_pdf(file_path) == False:
|
| 1635 |
-
pdf_image_file_paths.append(image_path)
|
| 1636 |
pymupdf_doc = pdf_image_file_paths
|
| 1637 |
|
| 1638 |
# Check if the image_path already exists in annotations_all_pages
|
|
|
|
| 536 |
if is_pdf(file_path) == False:
|
| 537 |
out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.png"
|
| 538 |
# pymupdf_doc is an image list in this case
|
| 539 |
+
if isinstance(pymupdf_doc[-1], str):
|
| 540 |
+
img = Image.open(pymupdf_doc[-1])
|
| 541 |
+
# Otherwise could be an image object
|
| 542 |
+
else:
|
| 543 |
+
img = pymupdf_doc[-1]
|
| 544 |
img.save(out_redacted_pdf_file_path, "PNG" ,resolution=image_dpi)
|
| 545 |
#
|
| 546 |
else:
|
|
|
|
| 566 |
# Convert annotations_all_pages to a consistent relative coordinate format output
|
| 567 |
page_sizes = page_sizes_df.to_dict(orient="records")
|
| 568 |
all_image_annotations_df = convert_annotation_data_to_dataframe(annotations_all_pages)
|
|
|
|
|
|
|
| 569 |
|
| 570 |
+
all_image_annotations_df = divide_coordinates_by_page_sizes(all_image_annotations_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
|
| 571 |
+
|
| 572 |
+
annotations_all_pages_divide = create_annotation_dicts_from_annotation_df(all_image_annotations_df, page_sizes)
|
| 573 |
+
annotations_all_pages_divide = remove_duplicate_images_with_blank_boxes(annotations_all_pages_divide)
|
| 574 |
|
| 575 |
# Save the gradio_annotation_boxes to a review csv file
|
| 576 |
+
review_file_state = convert_annotation_json_to_review_df(annotations_all_pages_divide, all_pages_decision_process_table, page_sizes=page_sizes)
|
| 577 |
|
| 578 |
# Don't need page sizes in outputs
|
| 579 |
review_file_state.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")
|
|
|
|
| 630 |
if total_textract_query_number > number_of_pages:
|
| 631 |
total_textract_query_number = number_of_pages
|
| 632 |
|
| 633 |
+
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages_divide, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
|
| 634 |
|
| 635 |
def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
|
| 636 |
'''
|
|
|
|
| 1357 |
|
| 1358 |
# If using Tesseract
|
| 1359 |
if text_extraction_method == tesseract_ocr_option:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1360 |
|
| 1361 |
if all_page_line_level_ocr_results_with_words:
|
| 1362 |
# Find the first dict where 'page' matches
|
| 1363 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1364 |
matching_page = next(
|
| 1365 |
(item for item in all_page_line_level_ocr_results_with_words if int(item.get('page', -1)) == int(reported_page_number)),
|
| 1366 |
None
|
| 1367 |
)
|
| 1368 |
|
|
|
|
|
|
|
| 1369 |
page_line_level_ocr_results_with_words = matching_page if matching_page else []
|
| 1370 |
else: page_line_level_ocr_results_with_words = []
|
| 1371 |
|
|
|
|
| 1375 |
else:
|
| 1376 |
page_word_level_ocr_results = image_analyser.perform_ocr(image_path)
|
| 1377 |
|
|
|
|
| 1378 |
page_line_level_ocr_results, page_line_level_ocr_results_with_words = combine_ocr_results(page_word_level_ocr_results, page=reported_page_number)
|
| 1379 |
|
| 1380 |
all_page_line_level_ocr_results_with_words.append(page_line_level_ocr_results_with_words)
|
|
|
|
|
|
|
| 1381 |
|
| 1382 |
# Check if page exists in existing textract data. If not, send to service to analyse
|
| 1383 |
if text_extraction_method == textract_option:
|
|
|
|
| 1455 |
|
| 1456 |
all_line_level_ocr_results_df_list.append(line_level_ocr_results_df)
|
| 1457 |
|
|
|
|
| 1458 |
if pii_identification_method != no_redaction_option:
|
| 1459 |
# Step 2: Analyse text and identify PII
|
| 1460 |
if chosen_redact_entities or chosen_redact_comprehend_entities:
|
|
|
|
| 1469 |
entities=chosen_redact_entities,
|
| 1470 |
allow_list=allow_list,
|
| 1471 |
score_threshold=score_threshold
|
| 1472 |
+
)
|
| 1473 |
|
| 1474 |
comprehend_query_number = comprehend_query_number + comprehend_query_number_new
|
| 1475 |
|
|
|
|
| 1502 |
# Assume image_path is an image
|
| 1503 |
image = image_path
|
| 1504 |
|
|
|
|
| 1505 |
fill = (0, 0, 0) # Fill colour for redactions
|
| 1506 |
draw = ImageDraw.Draw(image)
|
| 1507 |
|
| 1508 |
all_image_annotations_boxes = []
|
| 1509 |
|
| 1510 |
for box in page_merged_redaction_bboxes:
|
| 1511 |
+
|
| 1512 |
try:
|
| 1513 |
x0 = box.left
|
| 1514 |
y0 = box.top
|
| 1515 |
x1 = x0 + box.width
|
| 1516 |
y1 = y0 + box.height
|
|
|
|
| 1517 |
label = box.entity_type # Attempt to get the label
|
| 1518 |
+
text = box.text
|
| 1519 |
except AttributeError as e:
|
| 1520 |
print(f"Error accessing box attributes: {e}")
|
| 1521 |
label = "Redaction" # Default label if there's an error
|
|
|
|
| 1525 |
print(f"Invalid coordinates for box: {box}")
|
| 1526 |
continue # Skip this box if coordinates are invalid
|
| 1527 |
|
| 1528 |
+
img_annotation_box = {
|
|
|
|
| 1529 |
"xmin": x0,
|
| 1530 |
"ymin": y0,
|
| 1531 |
"xmax": x1,
|
| 1532 |
"ymax": y1,
|
| 1533 |
"label": label,
|
| 1534 |
+
"color": (0, 0, 0),
|
| 1535 |
+
"text": text
|
| 1536 |
+
}
|
| 1537 |
+
img_annotation_box = fill_missing_box_ids(img_annotation_box)
|
| 1538 |
+
|
| 1539 |
+
# Directly append the dictionary with the required keys
|
| 1540 |
+
all_image_annotations_boxes.append(img_annotation_box)
|
| 1541 |
|
| 1542 |
# Draw the rectangle
|
| 1543 |
try:
|
|
|
|
| 1545 |
except Exception as e:
|
| 1546 |
print(f"Error drawing rectangle: {e}")
|
| 1547 |
|
| 1548 |
+
page_image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
|
| 1549 |
+
|
| 1550 |
+
print("page_image_annotations at box drawing:", page_image_annotations)
|
| 1551 |
+
|
| 1552 |
+
redacted_image = image.copy()
|
| 1553 |
+
#redacted_image.save("test_out_image.png")
|
| 1554 |
+
|
| 1555 |
|
| 1556 |
# Convert decision process to table
|
| 1557 |
decision_process_table = pd.DataFrame([{
|
|
|
|
| 1570 |
all_pages_decision_process_table_list.append(decision_process_table)
|
| 1571 |
|
| 1572 |
decision_process_table = fill_missing_ids(decision_process_table)
|
| 1573 |
+
decision_process_table.to_csv(output_folder + "decision_process_table_with_ids.csv")
|
| 1574 |
|
| 1575 |
toc = time.perf_counter()
|
| 1576 |
|
|
|
|
| 1584 |
tqdm._instances.clear()
|
| 1585 |
|
| 1586 |
if is_pdf(file_path) == False:
|
| 1587 |
+
pdf_image_file_paths.append(redacted_image) # .append(image_path)
|
| 1588 |
pymupdf_doc = pdf_image_file_paths
|
| 1589 |
|
| 1590 |
# Check if the image_path already exists in annotations_all_pages
|
|
|
|
| 1597 |
annotations_all_pages.append(page_image_annotations)
|
| 1598 |
|
| 1599 |
|
|
|
|
| 1600 |
if text_extraction_method == textract_option:
|
| 1601 |
if original_textract_data != textract_data:
|
| 1602 |
# Write the updated existing textract data back to the JSON file
|
|
|
|
| 1618 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
|
| 1619 |
all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
|
| 1620 |
|
| 1621 |
+
|
| 1622 |
current_loop_page += 1
|
| 1623 |
|
| 1624 |
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
|
| 1625 |
|
| 1626 |
# If it's an image file
|
| 1627 |
if is_pdf(file_path) == False:
|
| 1628 |
+
pdf_image_file_paths.append(redacted_image)#.append(image_path)
|
| 1629 |
pymupdf_doc = pdf_image_file_paths
|
| 1630 |
|
| 1631 |
# Check if the image_path already exists in annotations_all_pages
|
tools/redaction_review.py
CHANGED
|
@@ -276,7 +276,7 @@ def update_annotator_page_from_review_df(
|
|
| 276 |
|
| 277 |
match = re.search(r"(\d+)\.png$", page_state_entry['image'])
|
| 278 |
if match: page_no = int(match.group(1))
|
| 279 |
-
else: page_no =
|
| 280 |
|
| 281 |
if 'image' in page_state_entry and page_no == page_num_reported_zero_indexed:
|
| 282 |
# Replace the annotations list for this page with the new list from review_df
|
|
|
|
| 276 |
|
| 277 |
match = re.search(r"(\d+)\.png$", page_state_entry['image'])
|
| 278 |
if match: page_no = int(match.group(1))
|
| 279 |
+
else: page_no = 0
|
| 280 |
|
| 281 |
if 'image' in page_state_entry and page_no == page_num_reported_zero_indexed:
|
| 282 |
# Replace the annotations list for this page with the new list from review_df
|