Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Jan 15

Commit

143e2cc

1 Parent(s): c3a8cd7

The app should now resize images that are too large before sending them to Textract. Textract calls are now more robust to failure. Improved the reliability of the JSON conversion to the review dataframe.

Browse files

Files changed (6) hide show

app.py +1 -1
tools/custom_image_analyser_engine.py +3 -3
tools/file_conversion.py +69 -39
tools/file_redaction.py +50 -20
tools/helper_functions.py +1 -1
tools/redaction_review.py +1 -1

app.py CHANGED Viewed

@@ -321,7 +321,7 @@ with app:
     ###
     in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
-    document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
     then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
     then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
                     outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\

     ###
     in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
+    document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state]).\
     then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
     then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
                     outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\

tools/custom_image_analyser_engine.py CHANGED Viewed

@@ -637,9 +637,9 @@ class CustomImageAnalyzerEngine:
                     result_reset_pos.start = 0
                     result_reset_pos.end = len(relevant_text)
-                    print("result_reset_pos:", result_reset_pos)
-                    print("relevant_line_ocr_result:", relevant_line_ocr_result)
-                    print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
                     # Map the analyzer results to bounding boxes for this line
                     line_results = self.map_analyzer_results_to_bounding_boxes(

                     result_reset_pos.start = 0
                     result_reset_pos.end = len(relevant_text)
+                    #print("result_reset_pos:", result_reset_pos)
+                    #print("relevant_line_ocr_result:", relevant_line_ocr_result)
+                    #print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
                     # Map the analyzer results to bounding boxes for this line
                     line_results = self.map_analyzer_results_to_bounding_boxes(

tools/file_conversion.py CHANGED Viewed

@@ -51,26 +51,40 @@ def is_pdf(filename):
 CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
 print(f'The value of CUSTOM_BOX_COLOUR is {CUSTOM_BOX_COLOUR}')
 def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
     try:
-        # Construct the full output directory path relative to the current working directory
         output_dir = os.path.join(os.getcwd(), output_dir)
-        # Use the output_dir to construct the out_path
         out_path = os.path.join(output_dir, f"{os.path.basename(pdf_path)}_{page_num}.png")
         os.makedirs(os.path.dirname(out_path), exist_ok=True)
         if os.path.exists(out_path):
-            #print(f"Loading existing image for page {page_num + 1}")
             image = Image.open(out_path)
         else:
-            #print(f"Converting page {page_num + 1}")
             image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
                                         dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
             image = image_l[0]
             image = image.convert("L")
             image.save(out_path, format="PNG")
         return page_num, out_path
     except Exception as e:
         print(f"Error processing page {page_num + 1}: {e}")
         return page_num, None
@@ -683,14 +697,20 @@ def join_values_within_threshold(df1, df2):
     print(final_df)
-def convert_review_json_to_pandas_df(data:List[dict], text_join_data=pd.DataFrame) -> pd.DataFrame:
     # Flatten the data
-    flattened_data = []
-    for entry in data:
-        #print("entry:", entry)
         #print("flattened_data:", flattened_data)
-        image_path = entry["image"]
         # Use regex to find the number before .png
         match = re.search(r'_(\d+)\.png$', image_path)
@@ -701,56 +721,66 @@ def convert_review_json_to_pandas_df(data:List[dict], text_join_data=pd.DataFram
         else:
             print("No number found before .png")
-        # Check if 'boxes' is in the entry, if not, add an empty list
-        if 'boxes' not in entry:
-            entry['boxes'] = []
-        for box in entry["boxes"]:
             if 'text' not in box:
-                data_to_add = {"image": image_path, "page": reported_number,  **box} # "text": entry['text'],
             else:
-                data_to_add = {"image": image_path, "page": reported_number, "text": entry['text'], **box}
             #print("data_to_add:", data_to_add)
-            flattened_data.append(data_to_add)
     # Convert to a DataFrame
-    df = pd.DataFrame(flattened_data)
     # Join on additional text data from decision output results if included
-    if not text_join_data.empty:
-        #print("text_join_data:", text_join_data)
-        #print("df:", df)
-        text_join_data['page'] = text_join_data['page'].astype(str)
-        df['page'] = df['page'].astype(str)
-        text_join_data = text_join_data[['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page', 'text']]
         # Round to the closest number divisible by 5
-        text_join_data[['xmin', 'ymin', 'xmax', 'ymax']] = (text_join_data[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
-        text_join_data = text_join_data.drop_duplicates(['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'])
-        df[['xmin1', 'ymin1', 'xmax1', 'ymax1']] = (df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
-        df = df.merge(text_join_data, left_on = ['xmin1', 'ymin1', 'xmax1', 'ymax1', 'label', 'page'], right_on = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'], how = "left", suffixes=("", "_y"))
-        df = df.drop(['xmin1', 'ymin1', 'xmax1', 'ymax1', 'xmin_y', 'ymin_y', 'xmax_y', 'ymax_y'], axis=1, errors="ignore")
-        df = df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]]
-    if 'text' not in df.columns:
-        df['text'] = ''
-    df = df.sort_values(['page', 'ymin', 'xmin', 'label'])
-    return df
-def convert_pandas_df_to_review_json(df: pd.DataFrame, image_paths: List[Image.Image]) -> List[dict]:
     '''
     Convert a review csv to a json file for use by the Gradio Annotation object
     '''
     # Keep only necessary columns
-    df = df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
     # Group the DataFrame by the 'image' column
-    grouped_csv_pages = df.groupby('page')
     # Create a list to hold the JSON data
     json_data = []
@@ -758,7 +788,7 @@ def convert_pandas_df_to_review_json(df: pd.DataFrame, image_paths: List[Image.I
     for n, pdf_image_path in enumerate(image_paths):
         reported_page_number = int(n + 1)
-        if reported_page_number in df["page"].values:
             # Convert each relevant group to a list of box dictionaries
             selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)

 CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
 print(f'The value of CUSTOM_BOX_COLOUR is {CUSTOM_BOX_COLOUR}')
+import os
+from pdf2image import convert_from_path
+from PIL import Image
 def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input', max_file_size: int = 5 * 1024 * 1024) -> tuple[int, str]:
     """Render one PDF page to a greyscale PNG on disk, shrinking it if the file is too large.

     The PNG is written to ``<cwd>/<output_dir>/<pdf basename>_<page_num>.png``.
     If that file already exists it is reused instead of re-rendering the page.
     When the file on disk is at or above ``max_file_size`` bytes, the image is
     scaled down (aspect ratio preserved) and overwritten in place.

     Args:
         pdf_path: Path to the source PDF.
         page_num: Zero-based page index (the renderer is called with ``page_num + 1``).
         image_dpi: Resolution passed to ``convert_from_path``.
         output_dir: Directory, joined onto the current working directory, that receives the PNG.
         max_file_size: Size threshold in bytes; files at or above it are downscaled.

     Returns:
         ``(page_num, out_path)`` on success, or ``(page_num, None)`` if anything
         failed. Failures are printed rather than raised, so callers must check
         for ``None``.
     """
     # PNG size does not scale linearly with pixel count, so one pass may not be
     # enough; retry a bounded number of times instead of looping forever.
     max_resize_attempts = 5
     # Undershoot the estimated scale slightly so each pass makes real progress,
     # including when the file is exactly at the threshold (estimated ratio == 1).
     safety_margin = 0.95

     try:
         # Construct the full output path relative to the current working directory
         output_dir = os.path.join(os.getcwd(), output_dir)
         out_path = os.path.join(output_dir, f"{os.path.basename(pdf_path)}_{page_num}.png")
         os.makedirs(os.path.dirname(out_path), exist_ok=True)

         if not os.path.exists(out_path):
             # Convert the single requested PDF page to a greyscale PNG
             rendered_pages = convert_from_path(pdf_path, first_page=page_num + 1, last_page=page_num + 1,
                                                dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
             rendered_pages[0].convert("L").save(out_path, format="PNG")

         # Only open the stored image when a resize is actually required, and
         # always close the file handle before overwriting the same path.
         file_size = os.path.getsize(out_path)
         attempts = 0
         while file_size >= max_file_size and attempts < max_resize_attempts:
             ratio = ((max_file_size / file_size) ** 0.5) * safety_margin
             with Image.open(out_path) as stored_image:
                 new_size = (max(1, int(stored_image.width * ratio)),
                             max(1, int(stored_image.height * ratio)))
                 # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
                 resized_image = stored_image.resize(new_size, Image.LANCZOS)
             resized_image.save(out_path, format="PNG")  # Overwrite with resized image
             file_size = os.path.getsize(out_path)
             attempts += 1

         return page_num, out_path

     except Exception as e:
         # Deliberate best-effort boundary: report the failure and let the caller
         # decide how to handle a missing page image.
         print(f"Error processing page {page_num + 1}: {e}")
         return page_num, None
     print(final_df)
+def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decision_output:pd.DataFrame=pd.DataFrame()) -> pd.DataFrame:
+    '''
+    Convert the annotation json data to a dataframe format. Add on any text from the initial review_file dataframe by joining on pages/co-ordinates (doesn't work very well currently).
+    '''
     # Flatten the data
+    flattened_annotation_data = []
+    if not isinstance(redaction_decision_output, pd.DataFrame):
+        redaction_decision_output = pd.DataFrame()
+    for annotation in all_annotations:
+        #print("annotation:", annotation)
         #print("flattened_data:", flattened_data)
+        image_path = annotation["image"]
         # Use regex to find the number before .png
         match = re.search(r'_(\d+)\.png$', image_path)
         else:
             print("No number found before .png")
+        # Check if 'boxes' is in the annotation, if not, add an empty list
+        if 'boxes' not in annotation:
+            annotation['boxes'] = []
+        for box in annotation["boxes"]:
             if 'text' not in box:
+                data_to_add = {"image": image_path, "page": reported_number,  **box} # "text": annotation['text'],
             else:
+                data_to_add = {"image": image_path, "page": reported_number, "text": annotation['text'], **box}
             #print("data_to_add:", data_to_add)
+            flattened_annotation_data.append(data_to_add)
     # Convert to a DataFrame
+    annotation_data_as_df = pd.DataFrame(flattened_annotation_data)
+    #print("redaction_decision_output:", redaction_decision_output)
+    #print("annotation_data_as_df:", annotation_data_as_df)
     # Join on additional text data from decision output results if included
+    if not redaction_decision_output.empty:
+        #print("redaction_decision_output is not empty")
+        #print("redaction_decision_output:", redaction_decision_output)
+        #print("annotation_data_as_df:", annotation_data_as_df)
+        redaction_decision_output['page'] = redaction_decision_output['page'].astype(str)
+        annotation_data_as_df['page'] = annotation_data_as_df['page'].astype(str)
+        redaction_decision_output = redaction_decision_output[['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page', 'text']]
         # Round to the closest number divisible by 5
+        redaction_decision_output.loc[:, ['xmin', 'ymin', 'xmax', 'ymax']] = (redaction_decision_output[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
+        redaction_decision_output = redaction_decision_output.drop_duplicates(['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'])
+        #annotation_data_as_df[['xmin1', 'ymin1', 'xmax1', 'ymax1']] = (annotation_data_as_df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
+        annotation_data_as_df.loc[:, ['xmin1', 'ymin1', 'xmax1', 'ymax1']] = (annotation_data_as_df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
+        annotation_data_as_df = annotation_data_as_df.merge(redaction_decision_output, left_on = ['xmin1', 'ymin1', 'xmax1', 'ymax1', 'label', 'page'], right_on = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'], how = "left", suffixes=("", "_y"))
+        annotation_data_as_df = annotation_data_as_df.drop(['xmin1', 'ymin1', 'xmax1', 'ymax1', 'xmin_y', 'ymin_y', 'xmax_y', 'ymax_y'], axis=1, errors="ignore")
+        annotation_data_as_df = annotation_data_as_df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]]
+    # Ensure required columns exist, filling with blank if they don't
+    for col in ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]:
+        if col not in annotation_data_as_df.columns:
+            annotation_data_as_df[col] = ''
+    annotation_data_as_df = annotation_data_as_df.sort_values(['page', 'ymin', 'xmin', 'label'])
+    return annotation_data_as_df
+def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths: List[Image.Image]) -> List[dict]:
     '''
     Convert a review csv to a json file for use by the Gradio Annotation object
     '''
     # Keep only necessary columns
+    review_file_df = review_file_df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
     # Group the DataFrame by the 'image' column
+    grouped_csv_pages = review_file_df.groupby('page')
     # Create a list to hold the JSON data
     json_data = []
     for n, pdf_image_path in enumerate(image_paths):
         reported_page_number = int(n + 1)
+        if reported_page_number in review_file_df["page"].values:
             # Convert each relevant group to a list of box dictionaries
             selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)

tools/file_redaction.py CHANGED Viewed

@@ -288,7 +288,7 @@ def choose_and_run_redactor(file_paths:List[str],
             print("Redacting file " + file_path_without_ext + " as an image-based file")
-            pymupdf_doc,all_decision_process_table,log_files_output_paths,new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
              prepared_pdf_image_paths,
              language,
              chosen_redact_entities,
@@ -314,6 +314,9 @@ def choose_and_run_redactor(file_paths:List[str],
              custom_recogniser_word_list,
              redact_whole_page_list)
             # Save Textract request metadata (if exists)
             if new_request_metadata:
                 print("Request metadata:", new_request_metadata)
@@ -396,10 +399,11 @@ def choose_and_run_redactor(file_paths:List[str],
                     json.dump(annotations_all_pages, f)
                 log_files_output_paths.append(out_annotation_file_path)
-                #print("Saving annotations to CSV")
                 # Convert json to csv and also save this
                 #print("annotations_all_pages:", annotations_all_pages)
                 review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
@@ -975,11 +979,11 @@ def redact_image_pdf(file_path:str,
     if analysis_type == textract_option:
         json_file_path = output_folder + file_name + "_textract.json"
-        log_files_output_paths.append(json_file_path)
         if not os.path.exists(json_file_path):
             print("No existing Textract results file found.")
-            existing_data = {}
             #text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox)  # Analyse page with Textract
             #log_files_output_paths.append(json_file_path)
             #request_metadata = request_metadata + "\n" + new_request_metadata
@@ -988,8 +992,12 @@ def redact_image_pdf(file_path:str,
             # Open the file and load the JSON data
             no_textract_file = False
             print("Found existing Textract json results file.")
             with open(json_file_path, 'r') as json_file:
-                existing_data = json.load(json_file)
     ###
@@ -1046,32 +1054,46 @@ def redact_image_pdf(file_path:str,
                 image.save(image_buffer, format='PNG')  # Save as PNG, or adjust format if needed
                 pdf_page_as_bytes = image_buffer.getvalue()
-                if not existing_data:
-                    text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox)  # Analyse page with Textract
-                    log_files_output_paths.append(json_file_path)
-                    request_metadata = request_metadata + "\n" + new_request_metadata
-                    existing_data = {"pages":[text_blocks]}
                 else:
                     # Check if the current reported_page_number exists in the loaded JSON
-                    page_exists = any(page['page_no'] == reported_page_number for page in existing_data.get("pages", []))
                     if not page_exists:  # If the page does not exist, analyze again
                         print(f"Page number {reported_page_number} not found in existing Textract data. Analysing.")
-                        text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox)  # Analyse page with Textract
                         # Check if "pages" key exists, if not, initialize it as an empty list
-                        if "pages" not in existing_data:
-                            existing_data["pages"] = []
                         # Append the new page data
-                        existing_data["pages"].append(text_blocks)
                         request_metadata = request_metadata + "\n" + new_request_metadata
                     else:
                         # If the page exists, retrieve the data
-                        text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
                 line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
@@ -1214,7 +1236,10 @@ def redact_image_pdf(file_path:str,
                 if analysis_type == textract_option:
                     # Write the updated existing textract data back to the JSON file
                     with open(json_file_path, 'w') as json_file:
-                        json.dump(existing_data, json_file, indent=4)  # indent=4 makes the JSON file pretty-printed
                 current_loop_page += 1
@@ -1245,7 +1270,10 @@ def redact_image_pdf(file_path:str,
             if analysis_type == textract_option:
                 # Write the updated existing textract data back to the JSON file
                 with open(json_file_path, 'w') as json_file:
-                    json.dump(existing_data, json_file, indent=4)  # indent=4 makes the JSON file pretty-printed
             return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
@@ -1253,7 +1281,9 @@ def redact_image_pdf(file_path:str,
         # Write the updated existing textract data back to the JSON file
         with open(json_file_path, 'w') as json_file:
-            json.dump(existing_data, json_file, indent=4)  # indent=4 makes the JSON file pretty-printed
     return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
@@ -1495,7 +1525,7 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
         analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
         # Convert the new columns to integers (if needed)
-        analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']].astype(float)
         analysed_bounding_boxes_df_text = analysed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
         analysed_bounding_boxes_df_text.columns = ["label", "start", "end", "score"]

             print("Redacting file " + file_path_without_ext + " as an image-based file")
+            pymupdf_doc, all_decision_process_table, log_files_output_paths, new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
              prepared_pdf_image_paths,
              language,
              chosen_redact_entities,
              custom_recogniser_word_list,
              redact_whole_page_list)
+            print("log_files_output_paths at end of image redact function:", log_files_output_paths)
             # Save Textract request metadata (if exists)
             if new_request_metadata:
                 print("Request metadata:", new_request_metadata)
                     json.dump(annotations_all_pages, f)
                 log_files_output_paths.append(out_annotation_file_path)
+                print("Saving annotations to CSV")
                 # Convert json to csv and also save this
                 #print("annotations_all_pages:", annotations_all_pages)
+                #print("all_decision_process_table:", all_decision_process_table)
                 review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
     if analysis_type == textract_option:
         json_file_path = output_folder + file_name + "_textract.json"
         if not os.path.exists(json_file_path):
             print("No existing Textract results file found.")
+            textract_data = {}
             #text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox)  # Analyse page with Textract
             #log_files_output_paths.append(json_file_path)
             #request_metadata = request_metadata + "\n" + new_request_metadata
             # Open the file and load the JSON data
             no_textract_file = False
             print("Found existing Textract json results file.")
+            if json_file_path not in log_files_output_paths:
+                log_files_output_paths.append(json_file_path)
             with open(json_file_path, 'r') as json_file:
+                textract_data = json.load(json_file)
     ###
                 image.save(image_buffer, format='PNG')  # Save as PNG, or adjust format if needed
                 pdf_page_as_bytes = image_buffer.getvalue()
+                if not textract_data:
+                    try:
+                        text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox)  # Analyse page with Textract
+                        if json_file_path not in log_files_output_paths:
+                            log_files_output_paths.append(json_file_path)
+                        textract_data = {"pages":[text_blocks]}
+                    except Exception as e:
+                        print("Textract extraction for page", reported_page_number, "failed due to:", e)
+                        textract_data = {"pages":[]}
+                        new_request_metadata = "Failed Textract API call"
+                    request_metadata = request_metadata + "\n" + new_request_metadata
                 else:
                     # Check if the current reported_page_number exists in the loaded JSON
+                    page_exists = any(page['page_no'] == reported_page_number for page in textract_data.get("pages", []))
                     if not page_exists:  # If the page does not exist, analyze again
                         print(f"Page number {reported_page_number} not found in existing Textract data. Analysing.")
+                        try:
+                            text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox)  # Analyse page with Textract
+                        except Exception as e:
+                            print("Textract extraction for page", reported_page_number, "failed due to:", e)
+                            text_blocks = []
+                            new_request_metadata = "Failed Textract API call"
                         # Check if "pages" key exists, if not, initialize it as an empty list
+                        if "pages" not in textract_data:
+                            textract_data["pages"] = []
                         # Append the new page data
+                        textract_data["pages"].append(text_blocks)
                         request_metadata = request_metadata + "\n" + new_request_metadata
                     else:
                         # If the page exists, retrieve the data
+                        text_blocks = next(page['data'] for page in textract_data["pages"] if page['page_no'] == reported_page_number)
                 line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
                 if analysis_type == textract_option:
                     # Write the updated existing textract data back to the JSON file
                     with open(json_file_path, 'w') as json_file:
+                        json.dump(textract_data, json_file, indent=4)  # indent=4 makes the JSON file pretty-printed
+                        if json_file_path not in log_files_output_paths:
+                            log_files_output_paths.append(json_file_path)
                 current_loop_page += 1
             if analysis_type == textract_option:
                 # Write the updated existing textract data back to the JSON file
                 with open(json_file_path, 'w') as json_file:
+                    json.dump(textract_data, json_file, indent=4)  # indent=4 makes the JSON file pretty-printed
+                    if json_file_path not in log_files_output_paths:
+                        log_files_output_paths.append(json_file_path)
             return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
         # Write the updated existing textract data back to the JSON file
         with open(json_file_path, 'w') as json_file:
+            json.dump(textract_data, json_file, indent=4)  # indent=4 makes the JSON file pretty-printed
+            if json_file_path not in log_files_output_paths:
+                log_files_output_paths.append(json_file_path)
     return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
         analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
         # Convert the new columns to integers (if needed)
+        analysed_bounding_boxes_df_new.loc[:, ['xmin', 'ymin', 'xmax', 'ymax']] = (analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
         analysed_bounding_boxes_df_text = analysed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
         analysed_bounding_boxes_df_text.columns = ["label", "start", "end", "score"]

tools/helper_functions.py CHANGED Viewed

@@ -17,7 +17,7 @@ def reset_state_vars():
             show_share_button=False,
             show_remove_button=False,
             interactive=False
-        )
 def get_or_create_env_var(var_name, default_value):
     # Get the environment variable if it exists

             show_share_button=False,
             show_remove_button=False,
             interactive=False
+        ), [], []
 def get_or_create_env_var(var_name, default_value):
     # Get the environment variable if it exists

tools/redaction_review.py CHANGED Viewed

@@ -56,7 +56,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
     recogniser_entities = []
     recogniser_dataframe = pd.DataFrame()
-    if recogniser_dataframe_gr.iloc[0,0] == "":
         try:
             review_dataframe = convert_review_json_to_pandas_df(image_annotator_object)[["page", "label"]]
             #print("review_dataframe['label']", review_dataframe["label"])

     recogniser_entities = []
     recogniser_dataframe = pd.DataFrame()
+    if recogniser_dataframe_gr.empty:
         try:
             review_dataframe = convert_review_json_to_pandas_df(image_annotator_object)[["page", "label"]]
             #print("review_dataframe['label']", review_dataframe["label"])