seanpedrickcase committed on
Commit
143e2cc
·
1 Parent(s): c3a8cd7

The app now resizes images that are too large before sending them to Textract. Textract calls are now more robust to failure. Improved reliability of JSON conversion to the review dataframe.

Browse files
app.py CHANGED
@@ -321,7 +321,7 @@ with app:
321
  ###
322
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
323
 
324
- document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
325
  then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
326
  then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
327
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
 
321
  ###
322
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
323
 
324
+ document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state]).\
325
  then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
326
  then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
327
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
tools/custom_image_analyser_engine.py CHANGED
@@ -637,9 +637,9 @@ class CustomImageAnalyzerEngine:
637
  result_reset_pos.start = 0
638
  result_reset_pos.end = len(relevant_text)
639
 
640
- print("result_reset_pos:", result_reset_pos)
641
- print("relevant_line_ocr_result:", relevant_line_ocr_result)
642
- print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
643
 
644
  # Map the analyzer results to bounding boxes for this line
645
  line_results = self.map_analyzer_results_to_bounding_boxes(
 
637
  result_reset_pos.start = 0
638
  result_reset_pos.end = len(relevant_text)
639
 
640
+ #print("result_reset_pos:", result_reset_pos)
641
+ #print("relevant_line_ocr_result:", relevant_line_ocr_result)
642
+ #print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
643
 
644
  # Map the analyzer results to bounding boxes for this line
645
  line_results = self.map_analyzer_results_to_bounding_boxes(
tools/file_conversion.py CHANGED
@@ -51,26 +51,40 @@ def is_pdf(filename):
51
  CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
52
  print(f'The value of CUSTOM_BOX_COLOUR is {CUSTOM_BOX_COLOUR}')
53
 
 
 
 
 
54
  def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
55
  try:
56
- # Construct the full output directory path relative to the current working directory
57
  output_dir = os.path.join(os.getcwd(), output_dir)
58
-
59
- # Use the output_dir to construct the out_path
60
  out_path = os.path.join(output_dir, f"{os.path.basename(pdf_path)}_{page_num}.png")
61
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
62
-
63
  if os.path.exists(out_path):
64
- #print(f"Loading existing image for page {page_num + 1}")
65
  image = Image.open(out_path)
66
  else:
67
- #print(f"Converting page {page_num + 1}")
68
  image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
69
  dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
70
  image = image_l[0]
71
  image = image.convert("L")
72
  image.save(out_path, format="PNG")
 
 
 
 
 
 
 
 
 
 
 
73
  return page_num, out_path
 
74
  except Exception as e:
75
  print(f"Error processing page {page_num + 1}: {e}")
76
  return page_num, None
@@ -683,14 +697,20 @@ def join_values_within_threshold(df1, df2):
683
  print(final_df)
684
 
685
 
686
- def convert_review_json_to_pandas_df(data:List[dict], text_join_data=pd.DataFrame) -> pd.DataFrame:
 
 
 
687
  # Flatten the data
688
- flattened_data = []
689
 
690
- for entry in data:
691
- #print("entry:", entry)
 
 
 
692
  #print("flattened_data:", flattened_data)
693
- image_path = entry["image"]
694
 
695
  # Use regex to find the number before .png
696
  match = re.search(r'_(\d+)\.png$', image_path)
@@ -701,56 +721,66 @@ def convert_review_json_to_pandas_df(data:List[dict], text_join_data=pd.DataFram
701
  else:
702
  print("No number found before .png")
703
 
704
- # Check if 'boxes' is in the entry, if not, add an empty list
705
- if 'boxes' not in entry:
706
- entry['boxes'] = []
707
 
708
- for box in entry["boxes"]:
709
  if 'text' not in box:
710
- data_to_add = {"image": image_path, "page": reported_number, **box} # "text": entry['text'],
711
  else:
712
- data_to_add = {"image": image_path, "page": reported_number, "text": entry['text'], **box}
713
  #print("data_to_add:", data_to_add)
714
- flattened_data.append(data_to_add)
715
 
716
  # Convert to a DataFrame
717
- df = pd.DataFrame(flattened_data)
 
 
 
718
 
719
  # Join on additional text data from decision output results if included
720
- if not text_join_data.empty:
721
- #print("text_join_data:", text_join_data)
722
- #print("df:", df)
723
- text_join_data['page'] = text_join_data['page'].astype(str)
724
- df['page'] = df['page'].astype(str)
725
- text_join_data = text_join_data[['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page', 'text']]
 
 
726
  # Round to the closest number divisible by 5
727
- text_join_data[['xmin', 'ymin', 'xmax', 'ymax']] = (text_join_data[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
728
- text_join_data = text_join_data.drop_duplicates(['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'])
 
729
 
730
- df[['xmin1', 'ymin1', 'xmax1', 'ymax1']] = (df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
 
 
731
 
732
- df = df.merge(text_join_data, left_on = ['xmin1', 'ymin1', 'xmax1', 'ymax1', 'label', 'page'], right_on = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'], how = "left", suffixes=("", "_y"))
733
 
734
- df = df.drop(['xmin1', 'ymin1', 'xmax1', 'ymax1', 'xmin_y', 'ymin_y', 'xmax_y', 'ymax_y'], axis=1, errors="ignore")
735
 
736
- df = df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]]
737
 
738
- if 'text' not in df.columns:
739
- df['text'] = ''
 
 
740
 
741
- df = df.sort_values(['page', 'ymin', 'xmin', 'label'])
742
 
743
- return df
744
 
745
- def convert_pandas_df_to_review_json(df: pd.DataFrame, image_paths: List[Image.Image]) -> List[dict]:
746
  '''
747
  Convert a review csv to a json file for use by the Gradio Annotation object
748
  '''
749
  # Keep only necessary columns
750
- df = df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
751
 
752
  # Group the DataFrame by the 'image' column
753
- grouped_csv_pages = df.groupby('page')
754
 
755
  # Create a list to hold the JSON data
756
  json_data = []
@@ -758,7 +788,7 @@ def convert_pandas_df_to_review_json(df: pd.DataFrame, image_paths: List[Image.I
758
  for n, pdf_image_path in enumerate(image_paths):
759
  reported_page_number = int(n + 1)
760
 
761
- if reported_page_number in df["page"].values:
762
 
763
  # Convert each relevant group to a list of box dictionaries
764
  selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)
 
51
  CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
52
  print(f'The value of CUSTOM_BOX_COLOUR is {CUSTOM_BOX_COLOUR}')
53
 
54
+ import os
55
+ from pdf2image import convert_from_path
56
+ from PIL import Image
57
+
58
  def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
59
  try:
60
+ # Construct the full output directory path
61
  output_dir = os.path.join(os.getcwd(), output_dir)
 
 
62
  out_path = os.path.join(output_dir, f"{os.path.basename(pdf_path)}_{page_num}.png")
63
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
64
+
65
  if os.path.exists(out_path):
66
+ # Load existing image
67
  image = Image.open(out_path)
68
  else:
69
+ # Convert PDF page to image
70
  image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
71
  dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
72
  image = image_l[0]
73
  image = image.convert("L")
74
  image.save(out_path, format="PNG")
75
+
76
+ # Check file size and resize if necessary
77
+ max_size = 5 * 1024 * 1024 # 5 MB in bytes
78
+ file_size = os.path.getsize(out_path)
79
+ if file_size >= max_size:
80
+ # Resize the image while maintaining aspect ratio
81
+ ratio = (max_size / file_size) ** 0.5
82
+ new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
83
+ image = image.resize(new_size, Image.ANTIALIAS)
84
+ image.save(out_path, format="PNG") # Overwrite with resized image
85
+
86
  return page_num, out_path
87
+
88
  except Exception as e:
89
  print(f"Error processing page {page_num + 1}: {e}")
90
  return page_num, None
 
697
  print(final_df)
698
 
699
 
700
+ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decision_output:pd.DataFrame=pd.DataFrame()) -> pd.DataFrame:
701
+ '''
702
+ Convert the annotation json data to a dataframe format. Add on any text from the initial review_file dataframe by joining on pages/co-ordinates (doesn't work very well currently).
703
+ '''
704
  # Flatten the data
705
+ flattened_annotation_data = []
706
 
707
+ if not isinstance(redaction_decision_output, pd.DataFrame):
708
+ redaction_decision_output = pd.DataFrame()
709
+
710
+ for annotation in all_annotations:
711
+ #print("annotation:", annotation)
712
  #print("flattened_data:", flattened_data)
713
+ image_path = annotation["image"]
714
 
715
  # Use regex to find the number before .png
716
  match = re.search(r'_(\d+)\.png$', image_path)
 
721
  else:
722
  print("No number found before .png")
723
 
724
+ # Check if 'boxes' is in the annotation, if not, add an empty list
725
+ if 'boxes' not in annotation:
726
+ annotation['boxes'] = []
727
 
728
+ for box in annotation["boxes"]:
729
  if 'text' not in box:
730
+ data_to_add = {"image": image_path, "page": reported_number, **box} # "text": annotation['text'],
731
  else:
732
+ data_to_add = {"image": image_path, "page": reported_number, "text": annotation['text'], **box}
733
  #print("data_to_add:", data_to_add)
734
+ flattened_annotation_data.append(data_to_add)
735
 
736
  # Convert to a DataFrame
737
+ annotation_data_as_df = pd.DataFrame(flattened_annotation_data)
738
+
739
+ #print("redaction_decision_output:", redaction_decision_output)
740
+ #print("annotation_data_as_df:", annotation_data_as_df)
741
 
742
  # Join on additional text data from decision output results if included
743
+ if not redaction_decision_output.empty:
744
+ #print("redaction_decision_output is not empty")
745
+ #print("redaction_decision_output:", redaction_decision_output)
746
+ #print("annotation_data_as_df:", annotation_data_as_df)
747
+ redaction_decision_output['page'] = redaction_decision_output['page'].astype(str)
748
+ annotation_data_as_df['page'] = annotation_data_as_df['page'].astype(str)
749
+ redaction_decision_output = redaction_decision_output[['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page', 'text']]
750
+
751
  # Round to the closest number divisible by 5
752
+ redaction_decision_output.loc[:, ['xmin', 'ymin', 'xmax', 'ymax']] = (redaction_decision_output[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
753
+
754
+ redaction_decision_output = redaction_decision_output.drop_duplicates(['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'])
755
 
756
+ #annotation_data_as_df[['xmin1', 'ymin1', 'xmax1', 'ymax1']] = (annotation_data_as_df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
757
+
758
+ annotation_data_as_df.loc[:, ['xmin1', 'ymin1', 'xmax1', 'ymax1']] = (annotation_data_as_df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
759
 
760
+ annotation_data_as_df = annotation_data_as_df.merge(redaction_decision_output, left_on = ['xmin1', 'ymin1', 'xmax1', 'ymax1', 'label', 'page'], right_on = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'], how = "left", suffixes=("", "_y"))
761
 
762
+ annotation_data_as_df = annotation_data_as_df.drop(['xmin1', 'ymin1', 'xmax1', 'ymax1', 'xmin_y', 'ymin_y', 'xmax_y', 'ymax_y'], axis=1, errors="ignore")
763
 
764
+ annotation_data_as_df = annotation_data_as_df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]]
765
 
766
+ # Ensure required columns exist, filling with blank if they don't
767
+ for col in ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]:
768
+ if col not in annotation_data_as_df.columns:
769
+ annotation_data_as_df[col] = ''
770
 
771
+ annotation_data_as_df = annotation_data_as_df.sort_values(['page', 'ymin', 'xmin', 'label'])
772
 
773
+ return annotation_data_as_df
774
 
775
+ def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths: List[Image.Image]) -> List[dict]:
776
  '''
777
  Convert a review csv to a json file for use by the Gradio Annotation object
778
  '''
779
  # Keep only necessary columns
780
+ review_file_df = review_file_df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
781
 
782
  # Group the DataFrame by the 'image' column
783
+ grouped_csv_pages = review_file_df.groupby('page')
784
 
785
  # Create a list to hold the JSON data
786
  json_data = []
 
788
  for n, pdf_image_path in enumerate(image_paths):
789
  reported_page_number = int(n + 1)
790
 
791
+ if reported_page_number in review_file_df["page"].values:
792
 
793
  # Convert each relevant group to a list of box dictionaries
794
  selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)
tools/file_redaction.py CHANGED
@@ -288,7 +288,7 @@ def choose_and_run_redactor(file_paths:List[str],
288
 
289
  print("Redacting file " + file_path_without_ext + " as an image-based file")
290
 
291
- pymupdf_doc,all_decision_process_table,log_files_output_paths,new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
292
  prepared_pdf_image_paths,
293
  language,
294
  chosen_redact_entities,
@@ -314,6 +314,9 @@ def choose_and_run_redactor(file_paths:List[str],
314
  custom_recogniser_word_list,
315
  redact_whole_page_list)
316
 
 
 
 
317
  # Save Textract request metadata (if exists)
318
  if new_request_metadata:
319
  print("Request metadata:", new_request_metadata)
@@ -396,10 +399,11 @@ def choose_and_run_redactor(file_paths:List[str],
396
  json.dump(annotations_all_pages, f)
397
  log_files_output_paths.append(out_annotation_file_path)
398
 
399
- #print("Saving annotations to CSV")
400
 
401
  # Convert json to csv and also save this
402
  #print("annotations_all_pages:", annotations_all_pages)
 
403
 
404
  review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
405
 
@@ -975,11 +979,11 @@ def redact_image_pdf(file_path:str,
975
  if analysis_type == textract_option:
976
 
977
  json_file_path = output_folder + file_name + "_textract.json"
978
- log_files_output_paths.append(json_file_path)
979
 
980
  if not os.path.exists(json_file_path):
981
  print("No existing Textract results file found.")
982
- existing_data = {}
983
  #text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
984
  #log_files_output_paths.append(json_file_path)
985
  #request_metadata = request_metadata + "\n" + new_request_metadata
@@ -988,8 +992,12 @@ def redact_image_pdf(file_path:str,
988
  # Open the file and load the JSON data
989
  no_textract_file = False
990
  print("Found existing Textract json results file.")
 
 
 
 
991
  with open(json_file_path, 'r') as json_file:
992
- existing_data = json.load(json_file)
993
 
994
  ###
995
 
@@ -1046,32 +1054,46 @@ def redact_image_pdf(file_path:str,
1046
  image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
1047
  pdf_page_as_bytes = image_buffer.getvalue()
1048
 
1049
- if not existing_data:
1050
- text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1051
- log_files_output_paths.append(json_file_path)
1052
- request_metadata = request_metadata + "\n" + new_request_metadata
 
 
1053
 
1054
- existing_data = {"pages":[text_blocks]}
 
 
 
 
 
 
1055
 
1056
  else:
1057
  # Check if the current reported_page_number exists in the loaded JSON
1058
- page_exists = any(page['page_no'] == reported_page_number for page in existing_data.get("pages", []))
1059
 
1060
  if not page_exists: # If the page does not exist, analyze again
1061
  print(f"Page number {reported_page_number} not found in existing Textract data. Analysing.")
1062
- text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
 
 
 
 
 
 
1063
 
1064
  # Check if "pages" key exists, if not, initialize it as an empty list
1065
- if "pages" not in existing_data:
1066
- existing_data["pages"] = []
1067
 
1068
  # Append the new page data
1069
- existing_data["pages"].append(text_blocks)
1070
 
1071
  request_metadata = request_metadata + "\n" + new_request_metadata
1072
  else:
1073
  # If the page exists, retrieve the data
1074
- text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
1075
 
1076
 
1077
  line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
@@ -1214,7 +1236,10 @@ def redact_image_pdf(file_path:str,
1214
  if analysis_type == textract_option:
1215
  # Write the updated existing textract data back to the JSON file
1216
  with open(json_file_path, 'w') as json_file:
1217
- json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
 
 
 
1218
 
1219
  current_loop_page += 1
1220
 
@@ -1245,7 +1270,10 @@ def redact_image_pdf(file_path:str,
1245
  if analysis_type == textract_option:
1246
  # Write the updated existing textract data back to the JSON file
1247
  with open(json_file_path, 'w') as json_file:
1248
- json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
 
 
 
1249
 
1250
  return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1251
 
@@ -1253,7 +1281,9 @@ def redact_image_pdf(file_path:str,
1253
  # Write the updated existing textract data back to the JSON file
1254
 
1255
  with open(json_file_path, 'w') as json_file:
1256
- json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
 
 
1257
 
1258
  return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1259
 
@@ -1495,7 +1525,7 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
1495
  analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
1496
 
1497
  # Convert the new columns to integers (if needed)
1498
- analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']].astype(float)
1499
 
1500
  analysed_bounding_boxes_df_text = analysed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
1501
  analysed_bounding_boxes_df_text.columns = ["label", "start", "end", "score"]
 
288
 
289
  print("Redacting file " + file_path_without_ext + " as an image-based file")
290
 
291
+ pymupdf_doc, all_decision_process_table, log_files_output_paths, new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
292
  prepared_pdf_image_paths,
293
  language,
294
  chosen_redact_entities,
 
314
  custom_recogniser_word_list,
315
  redact_whole_page_list)
316
 
317
+
318
+ print("log_files_output_paths at end of image redact function:", log_files_output_paths)
319
+
320
  # Save Textract request metadata (if exists)
321
  if new_request_metadata:
322
  print("Request metadata:", new_request_metadata)
 
399
  json.dump(annotations_all_pages, f)
400
  log_files_output_paths.append(out_annotation_file_path)
401
 
402
+ print("Saving annotations to CSV")
403
 
404
  # Convert json to csv and also save this
405
  #print("annotations_all_pages:", annotations_all_pages)
406
+ #print("all_decision_process_table:", all_decision_process_table)
407
 
408
  review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
409
 
 
979
  if analysis_type == textract_option:
980
 
981
  json_file_path = output_folder + file_name + "_textract.json"
982
+
983
 
984
  if not os.path.exists(json_file_path):
985
  print("No existing Textract results file found.")
986
+ textract_data = {}
987
  #text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
988
  #log_files_output_paths.append(json_file_path)
989
  #request_metadata = request_metadata + "\n" + new_request_metadata
 
992
  # Open the file and load the JSON data
993
  no_textract_file = False
994
  print("Found existing Textract json results file.")
995
+
996
+ if json_file_path not in log_files_output_paths:
997
+ log_files_output_paths.append(json_file_path)
998
+
999
  with open(json_file_path, 'r') as json_file:
1000
+ textract_data = json.load(json_file)
1001
 
1002
  ###
1003
 
 
1054
  image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
1055
  pdf_page_as_bytes = image_buffer.getvalue()
1056
 
1057
+ if not textract_data:
1058
+ try:
1059
+ text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1060
+
1061
+ if json_file_path not in log_files_output_paths:
1062
+ log_files_output_paths.append(json_file_path)
1063
 
1064
+ textract_data = {"pages":[text_blocks]}
1065
+ except Exception as e:
1066
+ print("Textract extraction for page", reported_page_number, "failed due to:", e)
1067
+ textract_data = {"pages":[]}
1068
+ new_request_metadata = "Failed Textract API call"
1069
+
1070
+ request_metadata = request_metadata + "\n" + new_request_metadata
1071
 
1072
  else:
1073
  # Check if the current reported_page_number exists in the loaded JSON
1074
+ page_exists = any(page['page_no'] == reported_page_number for page in textract_data.get("pages", []))
1075
 
1076
  if not page_exists: # If the page does not exist, analyze again
1077
  print(f"Page number {reported_page_number} not found in existing Textract data. Analysing.")
1078
+
1079
+ try:
1080
+ text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1081
+ except Exception as e:
1082
+ print("Textract extraction for page", reported_page_number, "failed due to:", e)
1083
+ text_bocks = []
1084
+ new_request_metadata = "Failed Textract API call"
1085
 
1086
  # Check if "pages" key exists, if not, initialize it as an empty list
1087
+ if "pages" not in textract_data:
1088
+ textract_data["pages"] = []
1089
 
1090
  # Append the new page data
1091
+ textract_data["pages"].append(text_blocks)
1092
 
1093
  request_metadata = request_metadata + "\n" + new_request_metadata
1094
  else:
1095
  # If the page exists, retrieve the data
1096
+ text_blocks = next(page['data'] for page in textract_data["pages"] if page['page_no'] == reported_page_number)
1097
 
1098
 
1099
  line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
 
1236
  if analysis_type == textract_option:
1237
  # Write the updated existing textract data back to the JSON file
1238
  with open(json_file_path, 'w') as json_file:
1239
+ json.dump(textract_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
1240
+
1241
+ if json_file_path not in log_files_output_paths:
1242
+ log_files_output_paths.append(json_file_path)
1243
 
1244
  current_loop_page += 1
1245
 
 
1270
  if analysis_type == textract_option:
1271
  # Write the updated existing textract data back to the JSON file
1272
  with open(json_file_path, 'w') as json_file:
1273
+ json.dump(textract_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
1274
+
1275
+ if json_file_path not in log_files_output_paths:
1276
+ log_files_output_paths.append(json_file_path)
1277
 
1278
  return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1279
 
 
1281
  # Write the updated existing textract data back to the JSON file
1282
 
1283
  with open(json_file_path, 'w') as json_file:
1284
+ json.dump(textract_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
1285
+ if json_file_path not in log_files_output_paths:
1286
+ log_files_output_paths.append(json_file_path)
1287
 
1288
  return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1289
 
 
1525
  analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
1526
 
1527
  # Convert the new columns to integers (if needed)
1528
+ analysed_bounding_boxes_df_new.loc[:, ['xmin', 'ymin', 'xmax', 'ymax']] = (analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
1529
 
1530
  analysed_bounding_boxes_df_text = analysed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
1531
  analysed_bounding_boxes_df_text.columns = ["label", "start", "end", "score"]
tools/helper_functions.py CHANGED
@@ -17,7 +17,7 @@ def reset_state_vars():
17
  show_share_button=False,
18
  show_remove_button=False,
19
  interactive=False
20
- )
21
 
22
  def get_or_create_env_var(var_name, default_value):
23
  # Get the environment variable if it exists
 
17
  show_share_button=False,
18
  show_remove_button=False,
19
  interactive=False
20
+ ), [], []
21
 
22
  def get_or_create_env_var(var_name, default_value):
23
  # Get the environment variable if it exists
tools/redaction_review.py CHANGED
@@ -56,7 +56,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
56
  recogniser_entities = []
57
  recogniser_dataframe = pd.DataFrame()
58
 
59
- if recogniser_dataframe_gr.iloc[0,0] == "":
60
  try:
61
  review_dataframe = convert_review_json_to_pandas_df(image_annotator_object)[["page", "label"]]
62
  #print("review_dataframe['label']", review_dataframe["label"])
 
56
  recogniser_entities = []
57
  recogniser_dataframe = pd.DataFrame()
58
 
59
+ if recogniser_dataframe_gr.empty:
60
  try:
61
  review_dataframe = convert_review_json_to_pandas_df(image_annotator_object)[["page", "label"]]
62
  #print("review_dataframe['label']", review_dataframe["label"])