Commit 36f8e9f
1 Parent(s): 47a3a80

Fix for image file redaction

Files changed:
- DocRedactApp_0.4.0.spec → DocRedactApp_0.6.1.spec (+1 -1)
- pyproject.toml (+1 -1)
- tools/config.py (+1 -1)
- tools/file_conversion.py (+3 -2)
- tools/file_redaction.py (+33 -40)
- tools/redaction_review.py (+1 -1)
DocRedactApp_0.4.0.spec → DocRedactApp_0.6.1.spec (RENAMED)

@@ -62,5 +62,5 @@ coll = COLLECT(
     strip=False,
     upx=True,
     upx_exclude=[],
-    name='DocRedactApp_0.4.0',
+    name='DocRedactApp_0.6.1',
 )
pyproject.toml (CHANGED)

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "doc_redaction" # Your application's name
-version = "0.6.
+version = "0.6.1" # Your application's current version
 description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface" # A short description
 readme = "README.md" # Path to your project's README file
 requires-python = ">=3.10" # The minimum Python version required
tools/config.py (CHANGED)

@@ -249,7 +249,7 @@ else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
 
 SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', 'False')
 
-GET_COST_CODES = get_or_create_env_var('GET_COST_CODES', '
+GET_COST_CODES = get_or_create_env_var('GET_COST_CODES', 'False')
 
 DEFAULT_COST_CODE = get_or_create_env_var('DEFAULT_COST_CODE', '')
 
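Note on the GET_COST_CODES change: the default is now 'False', so cost-code capture stays off unless the environment enables it. Below is a minimal, hypothetical sketch of the env-var-with-default pattern that get_or_create_env_var appears to implement; the real helper in tools/config.py may differ in details.

import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Return the environment variable if set; otherwise set it to the default and return that.
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value

# With this commit, cost codes are only requested when explicitly enabled:
GET_COST_CODES = get_or_create_env_var('GET_COST_CODES', 'False')
print(GET_COST_CODES)  # 'False' unless GET_COST_CODES is set in the environment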
tools/file_conversion.py (CHANGED)

@@ -1319,6 +1319,7 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
     for col in essential_box_cols:
         if col not in final_df.columns:
             final_df[col] = pd.NA # Add column with NA if it wasn't present in any box
+        final_df[col] = final_df[col].replace({None: pd.NA})
 
     base_cols = ["image"]
     extra_box_cols = [col for col in final_df.columns if col not in base_cols and col not in essential_box_cols]

@@ -1328,8 +1329,8 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
     # Using fill_value=pd.NA isn't strictly needed here as we added missing columns above,
     # but it's good practice if columns could be missing for other reasons.
     final_df = final_df.reindex(columns=final_col_order, fill_value=pd.NA)
-
-    final_df
+    final_df = final_df.dropna(subset=["xmin", "xmax", "ymin", "ymax", "text", "id", "label"], how="all")
+    final_df.replace({None: pd.NA})
 
     return final_df
 
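The file_conversion.py change normalises missing box values and drops placeholder rows before the annotation dataframe is returned. Below is a minimal sketch of that clean-up pattern, using the column names from the diff and invented toy data; it is not the repo's own code.

import pandas as pd

essential_box_cols = ["xmin", "xmax", "ymin", "ymax", "text", "id", "label"]

boxes = pd.DataFrame([
    # A real redaction box
    {"image": "page_1.png", "xmin": 0.1, "ymin": 0.2, "xmax": 0.3, "ymax": 0.4,
     "label": "PERSON", "text": "Jane Doe", "id": "abc123"},
    # A placeholder row with no box data, as can occur for pages with no redactions
    {"image": "page_2.png", "xmin": None, "ymin": None, "xmax": None, "ymax": None,
     "label": None, "text": None, "id": None},
])

# Normalise None to pd.NA column by column, mirroring the diff
for col in essential_box_cols:
    boxes[col] = boxes[col].replace({None: pd.NA})

# Drop rows whose essential box columns are all missing; the real box survives
boxes = boxes.dropna(subset=essential_box_cols, how="all")
print(boxes)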
tools/file_redaction.py (CHANGED)

@@ -536,7 +536,11 @@ def choose_and_run_redactor(file_paths:List[str],
         if is_pdf(file_path) == False:
             out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.png"
             # pymupdf_doc is an image list in this case
-
+            if isinstance(pymupdf_doc[-1], str):
+                img = Image.open(pymupdf_doc[-1])
+            # Otherwise could be an image object
+            else:
+                img = pymupdf_doc[-1]
             img.save(out_redacted_pdf_file_path, "PNG" ,resolution=image_dpi)
             #
         else:

@@ -562,13 +566,14 @@ def choose_and_run_redactor(file_paths:List[str],
         # Convert annotations_all_pages to a consistent relative coordinate format output
         page_sizes = page_sizes_df.to_dict(orient="records")
         all_image_annotations_df = convert_annotation_data_to_dataframe(annotations_all_pages)
-        all_image_annotations_df = divide_coordinates_by_page_sizes(all_image_annotations_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
-        annotations_all_pages = create_annotation_dicts_from_annotation_df(all_image_annotations_df, page_sizes)
 
-
+        all_image_annotations_df = divide_coordinates_by_page_sizes(all_image_annotations_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
+
+        annotations_all_pages_divide = create_annotation_dicts_from_annotation_df(all_image_annotations_df, page_sizes)
+        annotations_all_pages_divide = remove_duplicate_images_with_blank_boxes(annotations_all_pages_divide)
 
         # Save the gradio_annotation_boxes to a review csv file
-        review_file_state = convert_annotation_json_to_review_df(
+        review_file_state = convert_annotation_json_to_review_df(annotations_all_pages_divide, all_pages_decision_process_table, page_sizes=page_sizes)
 
         # Don't need page sizes in outputs
         review_file_state.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")

@@ -625,7 +630,7 @@ def choose_and_run_redactor(file_paths:List[str],
     if total_textract_query_number > number_of_pages:
         total_textract_query_number = number_of_pages
 
-    return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc,
+    return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages_divide, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
 
 def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
     '''

@@ -1352,33 +1357,15 @@ def redact_image_pdf(file_path:str,
 
         # If using Tesseract
         if text_extraction_method == tesseract_ocr_option:
-            #print("image_path:", image_path)
-            #print("print(type(image_path)):", print(type(image_path)))
-            #if not isinstance(image_path, image_path.image_path) or not isinstance(image_path, str): raise Exception("image_path object for page", reported_page_number, "not found, cannot perform local OCR analysis.")
-
-            # Check for existing page_line_level_ocr_results_with_words object:
-
-            # page_line_level_ocr_results = (
-            # all_page_line_level_ocr_results.get('results', [])
-            # if all_page_line_level_ocr_results.get('page') == reported_page_number
-            # else []
-            # )
 
             if all_page_line_level_ocr_results_with_words:
                 # Find the first dict where 'page' matches
 
-                #print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
-
-                print("All pages available:", [item.get('page') for item in all_page_line_level_ocr_results_with_words])
-                #print("Looking for page:", reported_page_number)
-
                 matching_page = next(
                     (item for item in all_page_line_level_ocr_results_with_words if int(item.get('page', -1)) == int(reported_page_number)),
                     None
                 )
 
-                #print("matching_page:", matching_page)
-
                 page_line_level_ocr_results_with_words = matching_page if matching_page else []
             else: page_line_level_ocr_results_with_words = []
 

@@ -1388,12 +1375,9 @@ def redact_image_pdf(file_path:str,
             else:
                 page_word_level_ocr_results = image_analyser.perform_ocr(image_path)
 
-                print("page_word_level_ocr_results:", page_word_level_ocr_results)
                 page_line_level_ocr_results, page_line_level_ocr_results_with_words = combine_ocr_results(page_word_level_ocr_results, page=reported_page_number)
 
                 all_page_line_level_ocr_results_with_words.append(page_line_level_ocr_results_with_words)
-
-                print("All pages available:", [item.get('page') for item in all_page_line_level_ocr_results_with_words])
 
         # Check if page exists in existing textract data. If not, send to service to analyse
         if text_extraction_method == textract_option:

@@ -1471,7 +1455,6 @@ def redact_image_pdf(file_path:str,
 
         all_line_level_ocr_results_df_list.append(line_level_ocr_results_df)
 
-
         if pii_identification_method != no_redaction_option:
             # Step 2: Analyse text and identify PII
             if chosen_redact_entities or chosen_redact_comprehend_entities:

@@ -1486,7 +1469,7 @@ def redact_image_pdf(file_path:str,
                         entities=chosen_redact_entities,
                         allow_list=allow_list,
                         score_threshold=score_threshold
-                    )
+                    )
 
                     comprehend_query_number = comprehend_query_number + comprehend_query_number_new
 

@@ -1519,20 +1502,20 @@ def redact_image_pdf(file_path:str,
                 # Assume image_path is an image
                 image = image_path
 
-
                 fill = (0, 0, 0) # Fill colour for redactions
                 draw = ImageDraw.Draw(image)
 
                 all_image_annotations_boxes = []
 
                 for box in page_merged_redaction_bboxes:
+
                     try:
                         x0 = box.left
                         y0 = box.top
                         x1 = x0 + box.width
                         y1 = y0 + box.height
-
                         label = box.entity_type # Attempt to get the label
+                        text = box.text
                     except AttributeError as e:
                         print(f"Error accessing box attributes: {e}")
                         label = "Redaction" # Default label if there's an error

@@ -1542,15 +1525,19 @@ def redact_image_pdf(file_path:str,
                         print(f"Invalid coordinates for box: {box}")
                         continue # Skip this box if coordinates are invalid
 
-
-                    all_image_annotations_boxes.append({
+                    img_annotation_box = {
                         "xmin": x0,
                         "ymin": y0,
                         "xmax": x1,
                         "ymax": y1,
                         "label": label,
-                        "color": (0, 0, 0)
-                    })
+                        "color": (0, 0, 0),
+                        "text": text
+                    }
+                    img_annotation_box = fill_missing_box_ids(img_annotation_box)
+
+                    # Directly append the dictionary with the required keys
+                    all_image_annotations_boxes.append(img_annotation_box)
 
                     # Draw the rectangle
                     try:

@@ -1558,7 +1545,13 @@ def redact_image_pdf(file_path:str,
                     except Exception as e:
                         print(f"Error drawing rectangle: {e}")
 
-                page_image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
+                page_image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
+
+                print("page_image_annotations at box drawing:", page_image_annotations)
+
+                redacted_image = image.copy()
+                #redacted_image.save("test_out_image.png")
+
 
                 # Convert decision process to table
                 decision_process_table = pd.DataFrame([{

@@ -1577,7 +1570,7 @@ def redact_image_pdf(file_path:str,
             all_pages_decision_process_table_list.append(decision_process_table)
 
             decision_process_table = fill_missing_ids(decision_process_table)
-
+            decision_process_table.to_csv(output_folder + "decision_process_table_with_ids.csv")
 
             toc = time.perf_counter()
 

@@ -1591,7 +1584,7 @@ def redact_image_pdf(file_path:str,
             tqdm._instances.clear()
 
             if is_pdf(file_path) == False:
-                pdf_image_file_paths.append(image_path)
+                pdf_image_file_paths.append(redacted_image) # .append(image_path)
                 pymupdf_doc = pdf_image_file_paths
 
             # Check if the image_path already exists in annotations_all_pages

@@ -1604,7 +1597,6 @@ def redact_image_pdf(file_path:str,
                 annotations_all_pages.append(page_image_annotations)
 
 
-
             if text_extraction_method == textract_option:
                 if original_textract_data != textract_data:
                     # Write the updated existing textract data back to the JSON file

@@ -1626,13 +1618,14 @@ def redact_image_pdf(file_path:str,
     all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
     all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
 
+
     current_loop_page += 1
 
     return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
 
     # If it's an image file
     if is_pdf(file_path) == False:
-        pdf_image_file_paths.append(image_path)
+        pdf_image_file_paths.append(redacted_image)#.append(image_path)
        pymupdf_doc = pdf_image_file_paths
 
        # Check if the image_path already exists in annotations_all_pages
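The core of the image-file fix in file_redaction.py: for image inputs, the working "document" is a list whose last entry may be either a file path or a PIL Image, and it is the redacted page image (not the original path) that gets saved and appended. Below is a minimal, self-contained sketch of that pattern with invented file names and an invented example box; the helper names are illustrative, not the repo's functions.

from PIL import Image, ImageDraw

def save_last_page_as_png(pages, out_path):
    # The last entry may be a file path (str) or an already-loaded PIL Image
    last = pages[-1]
    img = Image.open(last) if isinstance(last, str) else last
    img.save(out_path, "PNG")

def draw_redaction_boxes(image, boxes, fill=(0, 0, 0)):
    # Draw filled rectangles onto a copy so the original page image is untouched
    redacted = image.copy()
    draw = ImageDraw.Draw(redacted)
    for box in boxes:
        draw.rectangle([box["xmin"], box["ymin"], box["xmax"], box["ymax"]], fill=fill)
    return redacted

page = Image.new("RGB", (200, 100), "white")
boxes = [{"xmin": 10, "ymin": 10, "xmax": 80, "ymax": 30, "label": "PERSON", "color": (0, 0, 0), "text": "Jane Doe"}]
redacted_page = draw_redaction_boxes(page, boxes)
save_last_page_as_png([redacted_page], "example_redacted.png")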
tools/redaction_review.py (CHANGED)

@@ -276,7 +276,7 @@ def update_annotator_page_from_review_df(
 
             match = re.search(r"(\d+)\.png$", page_state_entry['image'])
             if match: page_no = int(match.group(1))
-            else: page_no =
+            else: page_no = 0
 
             if 'image' in page_state_entry and page_no == page_num_reported_zero_indexed:
                 # Replace the annotations list for this page with the new list from review_df