Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Aug 21, 2024

Commit

93ac94f

1 Parent(s): 8c33828

Updated decision making output files, log locations

Browse files

Files changed (4) hide show

app.py +13 -7
tools/data_anonymise.py +33 -27
tools/file_conversion.py +1 -6
tools/file_redaction.py +142 -117

app.py CHANGED Viewed

@@ -12,6 +12,9 @@ from tools.auth import authenticate_user
 #from tools.aws_functions import load_data_from_aws
 import gradio as gr
 add_folder_to_path("tesseract/")
 add_folder_to_path("poppler/poppler-24.02.0/Library/bin/")
@@ -21,6 +24,9 @@ chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "
 full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
 language = 'en'
 # Create the gradio interface
 app = gr.Blocks(theme = gr.themes.Base())
@@ -35,10 +41,10 @@ with app:
     session_hash_state = gr.State()
     s3_output_folder_state = gr.State()
-    feedback_logs_state = gr.State('feedback/log.csv')
-    feedback_s3_logs_loc_state = gr.State('feedback/')
-    usage_logs_state = gr.State('logs/log.csv')
-    usage_s3_logs_loc_state = gr.State('logs/')
     gr.Markdown(
     """
@@ -162,18 +168,18 @@ with app:
     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     callback = gr.CSVLogger()
-    callback.setup([session_hash_textbox], "logs")
     session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
     # User submitted feedback for pdf redactions
     pdf_callback = gr.CSVLogger()
-    pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text], "feedback")
     pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text], None, preprocess=False).\
     then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
     # User submitted feedback for data redactions
     data_callback = gr.CSVLogger()
-    data_callback.setup([data_feedback_radio, data_further_details_text], "feedback")
     data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text], None, preprocess=False).\
     then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[s3_logs_output_textbox])

 #from tools.aws_functions import load_data_from_aws
 import gradio as gr
+from datetime import datetime
+today_rev = datetime.now().strftime("%Y%m%d")
 add_folder_to_path("tesseract/")
 add_folder_to_path("poppler/poppler-24.02.0/Library/bin/")
 full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
 language = 'en'
+feedback_data_folder = 'feedback/' + today_rev + '/'
+logs_data_folder = 'logs/' + today_rev + '/'
 # Create the gradio interface
 app = gr.Blocks(theme = gr.themes.Base())
     session_hash_state = gr.State()
     s3_output_folder_state = gr.State()
+    feedback_logs_state = gr.State(feedback_data_folder + 'log.csv')
+    feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
+    usage_logs_state = gr.State(logs_data_folder + 'log.csv')
+    usage_s3_logs_loc_state = gr.State(logs_data_folder)
     gr.Markdown(
     """
     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     callback = gr.CSVLogger()
+    callback.setup([session_hash_textbox], logs_data_folder)
     session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
     # User submitted feedback for pdf redactions
     pdf_callback = gr.CSVLogger()
+    pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text], feedback_data_folder)
     pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text], None, preprocess=False).\
     then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
     # User submitted feedback for data redactions
     data_callback = gr.CSVLogger()
+    data_callback.setup([data_feedback_radio, data_further_details_text], feedback_data_folder)
     data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text], None, preprocess=False).\
     then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[s3_logs_output_textbox])

tools/data_anonymise.py CHANGED Viewed

@@ -23,27 +23,7 @@ fake = Faker("en_UK")
 def fake_first_name(x):
     return fake.first_name()
-# Writing decision making process to file
-def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult], df_dict: Dict[str, List[Any]]) -> str:
-    """
-    Generate a detailed output of the decision process for entity recognition.
-    This function takes the results from the analyzer and the original data dictionary,
-    and produces a string output detailing the decision process for each recognized entity.
-    It includes information such as entity type, position, confidence score, and the context
-    in which the entity was found.
-    Args:
-        analyzer_results (List[DictAnalyzerResult]): The results from the entity analyzer.
-        df_dict (Dict[str, List[Any]]): The original data in dictionary format.
-    Returns:
-        str: A string containing the detailed decision process output.
-    """
-    decision_process_output = []
-    keys_to_keep = ['entity_type', 'start', 'end']
-    def process_recognizer_result(result, recognizer_result, data_row, dictionary_key, df_dict, keys_to_keep):
         output = []
         if hasattr(result, 'value'):
@@ -66,29 +46,53 @@ def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult],
                 output.append(str(analysis_explanation))
         return output
-    #print("Analyser results:", analyzer_results)
     # Run through each column to analyse for PII
     for i, result in enumerate(analyzer_results):
         print("Looking at result:", str(i))
         # If a single result
         if isinstance(result, RecognizerResult):
             decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
         # If a list of results
-        elif isinstance(result, List):
-            for x, recognizer_result in enumerate(result.recognizer_results):
                 decision_process_output.extend(process_recognizer_result(result, recognizer_result, x, i, df_dict, keys_to_keep))
         else:
             try:
                 decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
             except Exception as e:
                 print(e)
     decision_process_output_str = '\n'.join(decision_process_output)
     return decision_process_output_str
@@ -220,6 +224,8 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
     # Usage in the main function:
     decision_process_output_str = generate_decision_process_output(analyzer_results, df_dict)
     analyse_toc = time.perf_counter()
     analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
     print(analyse_time_out)
@@ -325,12 +331,12 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
             # Write each DataFrame to a different worksheet.
             anon_df_out.to_excel(writer, sheet_name=excel_sheet_name, index=None)
-        decision_process_log_output_file = anon_xlsx_export_file_name + "decision_process_output.txt"
         with open(decision_process_log_output_file, "w") as f:
             f.write(decision_process_output_str)
     else:
-        anon_export_file_name = output_folder + out_file_part + "_" + excel_sheet_name + "_anon_" + anon_strat_txt + ".csv"
         anon_df_out.to_csv(anon_export_file_name, index = None)
         decision_process_log_output_file = anon_export_file_name + "_decision_process_output.txt"

 def fake_first_name(x):
     return fake.first_name()
+def process_recognizer_result(result, recognizer_result, data_row, dictionary_key, df_dict, keys_to_keep):
         output = []
         if hasattr(result, 'value'):
                 output.append(str(analysis_explanation))
         return output
+# Writing decision making process to file
+def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult], df_dict: Dict[str, List[Any]]) -> str:
+    """
+    Generate a detailed output of the decision process for entity recognition.
+    This function takes the results from the analyzer and the original data dictionary,
+    and produces a string output detailing the decision process for each recognized entity.
+    It includes information such as entity type, position, confidence score, and the context
+    in which the entity was found.
+    Args:
+        analyzer_results (List[DictAnalyzerResult]): The results from the entity analyzer.
+        df_dict (Dict[str, List[Any]]): The original data in dictionary format.
+    Returns:
+        str: A string containing the detailed decision process output.
+    """
+    decision_process_output = []
+    keys_to_keep = ['entity_type', 'start', 'end']
     # Run through each column to analyse for PII
     for i, result in enumerate(analyzer_results):
         print("Looking at result:", str(i))
+        print("result:\n\n", result)
         # If a single result
         if isinstance(result, RecognizerResult):
+            print("Processing recogniser result as RecognizerResult:", str(i))
             decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
         # If a list of results
+        elif isinstance(result, list) or isinstance(result, DictAnalyzerResult):
+            for x, recognizer_result in enumerate(result.recognizer_results):
+                print("Processing recogniser result as List:", str(i))
                 decision_process_output.extend(process_recognizer_result(result, recognizer_result, x, i, df_dict, keys_to_keep))
         else:
             try:
+                print("Processing recogniser result in other:", str(i))
                 decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
             except Exception as e:
                 print(e)
     decision_process_output_str = '\n'.join(decision_process_output)
+    print("decision_process_output_str:\n\n", decision_process_output_str)
     return decision_process_output_str
     # Usage in the main function:
     decision_process_output_str = generate_decision_process_output(analyzer_results, df_dict)
+    #print("decision_process_output_str:\n\n", decision_process_output_str)
     analyse_toc = time.perf_counter()
     analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
     print(analyse_time_out)
             # Write each DataFrame to a different worksheet.
             anon_df_out.to_excel(writer, sheet_name=excel_sheet_name, index=None)
+        decision_process_log_output_file = anon_xlsx_export_file_name + "_" + excel_sheet_name + "_decision_process_output.txt"
         with open(decision_process_log_output_file, "w") as f:
             f.write(decision_process_output_str)
     else:
+        anon_export_file_name = output_folder + out_file_part + "_anon_" + anon_strat_txt + ".csv"
         anon_df_out.to_csv(anon_export_file_name, index = None)
         decision_process_log_output_file = anon_export_file_name + "_decision_process_output.txt"

tools/file_conversion.py CHANGED Viewed

@@ -91,8 +91,6 @@ def process_file(file_path):
     return img_object
 def prepare_image_or_text_pdf(
     file_paths: List[str],
     in_redact_method: str,
@@ -123,9 +121,7 @@ def prepare_image_or_text_pdf(
     # If out message or out_file_paths are blank, change to a list so it can be appended to
     #if isinstance(out_message, str):
-    #    out_message = [out_message]
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
@@ -189,7 +185,6 @@ def prepare_image_or_text_pdf(
     return out_message, out_file_paths
 def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
     file_path_without_ext = get_file_path_end(in_file_path)

     return img_object
 def prepare_image_or_text_pdf(
     file_paths: List[str],
     in_redact_method: str,
     # If out message or out_file_paths are blank, change to a list so it can be appended to
     #if isinstance(out_message, str):
+    #    out_message = [out_message]
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
     return out_message, out_file_paths
 def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
     file_path_without_ext = get_file_path_end(in_file_path)

tools/file_redaction.py CHANGED Viewed

@@ -247,142 +247,167 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
     return images, decision_process_output_str
 def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
     '''
     Redact chosen entities from a pdf that is made up of multiple pages that are not images.
     '''
-    combined_analyzer_results = []
-    analyser_explanations = []
     annotations_all_pages = []
-    analyzed_bounding_boxes_df = pd.DataFrame()
-    # Horizontal distance between PII bounding boxes under/equal they are combined into one
-    combine_pixel_dist = 100
     pdf = Pdf.open(filename)
     page_num = 0
-    #for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
     for page in pdf.pages:
         print("Page number is:", page_num + 1)
         annotations_on_page = []
-        analyzed_bounding_boxes = []
         for page_layout in extract_pages(filename, page_numbers = [page_num], maxpages=1):
-            analyzer_results = []
             for text_container in page_layout:
-                if isinstance(text_container, LTTextContainer):
-                    text_to_analyze = text_container.get_text()
-                    analyzer_results = []
-                    characters = []
-                    analyzer_results = nlp_analyser.analyze(text=text_to_analyze,
-                                                            language=language,
-                                                            entities=chosen_redact_entities,
-                                                            score_threshold=score_threshold,
-                                                            return_decision_process=True,
-                                                            allow_list=allow_list)
-                    characters = [char                    # This is what we want to include in the list
-                            for line in text_container          # Loop through each line in text_container
-                            if isinstance(line, LTTextLine)    # Check if the line is an instance of LTTextLine
-                            for char in line]                   # Loop through each character in the line
-                            #if isinstance(char, LTChar)]  # Check if the character is not an instance of LTAnno #isinstance(char, LTChar) or
-                    # if len(analyzer_results) > 0 and len(characters) > 0:
-                    #     analyzed_bounding_boxes.extend({"boundingBox": char.bbox, "result": result} for result in analyzer_results for char in characters[result.start:result.end] if isinstance(char, LTChar))
-                    #     combined_analyzer_results.extend(analyzer_results)
-                    # Inside the loop where you process analyzer_results:
-                    if len(analyzer_results) > 0 and len(characters) > 0:
-                        merged_bounding_boxes = []
-                        current_box = None
-                        current_y = None
-                        for result in analyzer_results:
-                            for char in characters[result.start : result.end]:
-                                if isinstance(char, LTChar):
-                                    char_box = list(char.bbox)
-                                    # Fix: Check if either current_y or current_box are None
-                                    if current_y is None or current_box is None:
-                                        # This is the first character, so initialize current_box and current_y
-                                        current_box = char_box
-                                        current_y = char_box[1]
-                                    else:  # Now we have previous values to compare
-                                        #print("Comparing values")
-                                        vertical_diff_bboxes = abs(char_box[1] - current_y)
-                                        horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
-                                        #print("Vertical distance with last bbox: ", str(vertical_diff_bboxes), "Horizontal distance: ", str(horizontal_diff_bboxes), "For result: ", result)
-                                        if (
-                                            vertical_diff_bboxes <= 5
-                                            and horizontal_diff_bboxes <= combine_pixel_dist
-                                        ):
-                                            old_right_pos = current_box[2]
-                                            current_box[2] = char_box[2]
-                                        else:
-                                            merged_bounding_boxes.append(
-                                                {"boundingBox": current_box, "result": result})
-                                            current_box = char_box
-                                            current_y = char_box[1]
-                            # Add the last box
-                            if current_box:
-                                merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
-                        if not merged_bounding_boxes:
-                            analyzed_bounding_boxes.extend({"boundingBox": char.bbox, "result": result} for result in analyzer_results for char in characters[result.start:result.end] if isinstance(char, LTChar))
-                        else:
-                            analyzed_bounding_boxes.extend(merged_bounding_boxes)
-                        combined_analyzer_results.extend(analyzer_results)
-            if len(analyzer_results) > 0:
-                #decision_process_output_str = generate_decision_process_output(analyzer_results, {'text':text_to_analyze})
-                #print("Decision process:", decision_process_output_str)
-                # Create summary df of annotations to be made
-                analyzed_bounding_boxes_df_new = pd.DataFrame(analyzed_bounding_boxes)
-                analyzed_bounding_boxes_df_text = analyzed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
-                analyzed_bounding_boxes_df_text.columns = ["type", "start", "end", "score"]
-                analyzed_bounding_boxes_df_new = pd.concat([analyzed_bounding_boxes_df_new, analyzed_bounding_boxes_df_text], axis = 1)
-                analyzed_bounding_boxes_df_new['page'] = page_num + 1
-                analyzed_bounding_boxes_df = pd.concat([analyzed_bounding_boxes_df, analyzed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
-                print('analyzed_bounding_boxes_df:', analyzed_bounding_boxes_df)
-            for analyzed_bounding_box in analyzed_bounding_boxes:
-                bounding_box = analyzed_bounding_box["boundingBox"]
-                annotation = Dictionary(
-                    Type=Name.Annot,
-                    Subtype=Name.Square, #Name.Highlight,
-                    QuadPoints=[bounding_box[0], bounding_box[3], bounding_box[2], bounding_box[3], bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[1]],
-                    Rect=[bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[3]],
-                    C=[0, 0, 0],
-                    IC=[0, 0, 0],
-                    CA=1, # Transparency
-                    T=analyzed_bounding_box["result"].entity_type,
-                    BS=Dictionary(
-                        W=0,                     # Border width: 1 point
-                        S=Name.S                # Border style: solid
-                    )
-                )
-                annotations_on_page.append(annotation)
-            annotations_all_pages.extend([annotations_on_page])
-            print("For page number:", page_num, "there are", len(annotations_all_pages[page_num]), "annotations")
             page.Annots = pdf.make_indirect(annotations_on_page)
             page_num += 1
-    return pdf, analyzed_bounding_boxes_df

     return images, decision_process_output_str
+def analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list):
+    if isinstance(text_container, LTTextContainer):
+        text_to_analyze = text_container.get_text()
+        analyzer_results = nlp_analyser.analyze(text=text_to_analyze,
+                                                language=language,
+                                                entities=chosen_redact_entities,
+                                                score_threshold=score_threshold,
+                                                return_decision_process=True,
+                                                allow_list=allow_list)
+        characters = [char
+                for line in text_container
+                if isinstance(line, LTTextLine)
+                for char in line]
+        return analyzer_results, characters
+    return [], []
+# Inside the loop where you process analyzer_results, merge bounding boxes that are right next to each other:
+def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist):
+    analyzed_bounding_boxes = []
+    if len(analyzer_results) > 0 and len(characters) > 0:
+        merged_bounding_boxes = []
+        current_box = None
+        current_y = None
+        for i, result in enumerate(analyzer_results):
+            print("Considering result", str(i))
+            for char in characters[result.start : result.end]:
+                if isinstance(char, LTChar):
+                    char_box = list(char.bbox)
+                    if current_y is None or current_box is None:
+                        current_box = char_box
+                        current_y = char_box[1]
+                    else:
+                        vertical_diff_bboxes = abs(char_box[1] - current_y)
+                        horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
+                        if (
+                            vertical_diff_bboxes <= 5
+                            and horizontal_diff_bboxes <= combine_pixel_dist
+                        ):
+                            current_box[2] = char_box[2]  # Extend the current box horizontally
+                        else:
+                            merged_bounding_boxes.append(
+                                {"boundingBox": current_box, "result": result})
+                            # Reset current_box and current_y after appending
+                            current_box = char_box
+                            current_y = char_box[1]
+            # After finishing with the current result, add the last box for this result
+            if current_box:
+                merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
+                current_box = None
+                current_y = None  # Reset for the next result
+        if not merged_bounding_boxes:
+            analyzed_bounding_boxes.extend(
+                {"boundingBox": char.bbox, "result": result}
+                for result in analyzer_results
+                for char in characters[result.start:result.end]
+                if isinstance(char, LTChar)
+            )
+        else:
+            analyzed_bounding_boxes.extend(merged_bounding_boxes)
+        print("analysed_bounding_boxes:\n\n", analyzed_bounding_boxes)
+    return analyzed_bounding_boxes
+def create_text_redaction_process_results(analyzer_results, analyzed_bounding_boxes, page_num):
+    decision_process_table = pd.DataFrame()
+    if len(analyzer_results) > 0:
+        # Create summary df of annotations to be made
+        analyzed_bounding_boxes_df_new = pd.DataFrame(analyzed_bounding_boxes)
+        analyzed_bounding_boxes_df_text = analyzed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
+        analyzed_bounding_boxes_df_text.columns = ["type", "start", "end", "score"]
+        analyzed_bounding_boxes_df_new = pd.concat([analyzed_bounding_boxes_df_new, analyzed_bounding_boxes_df_text], axis = 1)
+        analyzed_bounding_boxes_df_new['page'] = page_num + 1
+        decision_process_table = pd.concat([decision_process_table, analyzed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
+        print('\n\ndecision_process_table:\n\n', decision_process_table)
+    return decision_process_table
+def create_annotations_for_bounding_boxes(analyzed_bounding_boxes):
+    annotations_on_page = []
+    for analyzed_bounding_box in analyzed_bounding_boxes:
+        bounding_box = analyzed_bounding_box["boundingBox"]
+        annotation = Dictionary(
+            Type=Name.Annot,
+            Subtype=Name.Square, #Name.Highlight,
+            QuadPoints=[bounding_box[0], bounding_box[3], bounding_box[2], bounding_box[3],
+                        bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[1]],
+            Rect=[bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[3]],
+            C=[0, 0, 0],
+            IC=[0, 0, 0],
+            CA=1, # Transparency
+            T=analyzed_bounding_box["result"].entity_type,
+            BS=Dictionary(
+                W=0,                     # Border width: 1 point
+                S=Name.S                # Border style: solid
+            )
+        )
+        annotations_on_page.append(annotation)
+    return annotations_on_page
 def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
     '''
     Redact chosen entities from a pdf that is made up of multiple pages that are not images.
     '''
     annotations_all_pages = []
+    decision_process_table_all_pages = []
+    combine_pixel_dist = 100 # Horizontal distance between PII bounding boxes under/equal they are combined into one
     pdf = Pdf.open(filename)
     page_num = 0
     for page in pdf.pages:
         print("Page number is:", page_num + 1)
         annotations_on_page = []
+        decision_process_table_on_page = []
         for page_layout in extract_pages(filename, page_numbers = [page_num], maxpages=1):
+            page_analyzer_results = []
+            page_analyzed_bounding_boxes = []
+            text_container_analyzer_results = []
+            text_container_analyzed_bounding_boxes = []
+            characters = []
             for text_container in page_layout:
+                text_container_analyzer_results, characters = analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list)
+                # Merge bounding boxes if very close together
+                text_container_analyzed_bounding_boxes = merge_bounding_boxes(text_container_analyzer_results, characters, combine_pixel_dist)
+                print("\n\nanalyzed_bounding_boxes_in_loop:", text_container_analyzed_bounding_boxes)
+                page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
+                page_analyzer_results.extend(text_container_analyzer_results)
+            print("analyzed_bounding_boxes_out_loop:\n\n", page_analyzed_bounding_boxes)
+            decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)
+            annotations_on_page = create_annotations_for_bounding_boxes(page_analyzed_bounding_boxes)
+            #print('\n\nannotations_on_page:', annotations_on_page)
+            # Make page annotations
             page.Annots = pdf.make_indirect(annotations_on_page)
+            annotations_all_pages.extend([annotations_on_page])
+            decision_process_table_all_pages.extend([decision_process_table_on_page])
+            print("For page number:", page_num, "there are", len(annotations_all_pages[page_num]), "annotations")
             page_num += 1
+    return pdf, decision_process_table_all_pages