Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Sep 3, 2024

Commit

bc4bdbd

1 Parent(s): bbf818d

Can now select only specific pages in a document to redact. Image-based redaction should now work correctly.

Browse files

Files changed (4) hide show

app.py +30 -8
requirements.txt +3 -3
tools/file_conversion.py +9 -4
tools/file_redaction.py +112 -52

app.py CHANGED Viewed

@@ -27,6 +27,19 @@ language = 'en'
 feedback_data_folder = 'feedback/' + today_rev + '/'
 logs_data_folder = 'logs/' + today_rev + '/'
 # Create the gradio interface
 app = gr.Blocks(theme = gr.themes.Base())
@@ -42,16 +55,20 @@ with app:
     session_hash_state = gr.State()
     s3_output_folder_state = gr.State()
     feedback_logs_state = gr.State(feedback_data_folder + 'log.csv')
     feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
     usage_logs_state = gr.State(logs_data_folder + 'log.csv')
     usage_s3_logs_loc_state = gr.State(logs_data_folder)
     gr.Markdown(
     """
     # Document redaction
-    Redact personal information from documents, open text, or xlsx/csv tabular data. See the 'Redaction settings' to change various settings such as which types of information to redact (e.g. people, places), or terms to exclude from redaction.
     WARNING: In testing the app seems to only find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
@@ -115,6 +132,9 @@ with app:
     """)
         with gr.Accordion("Settings for documents", open = True):
             in_redaction_method = gr.Radio(label="Default document redaction method - text analysis is faster is not useful for image-based PDFs. Imaged-based is slightly less accurate in general.", value = "Text analysis", choices=["Text analysis", "Image analysis"])
         with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
             anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
@@ -143,12 +163,12 @@ with app:
     # Document redaction
     redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
-    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state],
                     outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state], api_name="redact_doc")
     # If the output file count text box changes, keep going with redacting each document until done
     text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
-    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state],
                     outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state]).\
     then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
@@ -162,9 +182,11 @@ with app:
     then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
     #app.load(wipe_logs, inputs=[feedback_logs_state, usage_logs_state], outputs=[]).\
-    #    then(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
-    app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     callback = gr.CSVLogger()
@@ -190,6 +212,6 @@ print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
 if __name__ == "__main__":
     if os.environ['COGNITO_AUTH'] == "1":
-        app.queue().launch(show_error=True, auth=authenticate_user, max_file_size='10mb')
     else:
-        app.queue().launch(show_error=True, inbrowser=True, max_file_size='10mb')

 feedback_data_folder = 'feedback/' + today_rev + '/'
 logs_data_folder = 'logs/' + today_rev + '/'
+def create_logs_folder(session_hash_textbox):  # Build feedback/usage log locations that include the given session hash and wrap each in a gr.State.
+    print("session_hash_textbox", session_hash_textbox)  # Debug trace of the value received.
+    feedback_data_folder = 'feedback/' + session_hash_textbox + "/" + today_rev + '/'  # Local name: shadows the module-level feedback_data_folder instead of changing it. today_rev is defined outside this excerpt.
+    logs_data_folder = 'logs/' + session_hash_textbox + "/" + today_rev + '/'  # Local name: shadows the module-level logs_data_folder in the same way.
+    feedback_logs_state = gr.State(feedback_data_folder + 'log.csv')  # Feedback log file path inside the session folder.
+    feedback_s3_logs_loc_state = gr.State(feedback_data_folder)  # Feedback folder itself; presumably used as the S3 log location, going by the name - confirm against callers.
+    usage_logs_state = gr.State(logs_data_folder + 'log.csv')  # Usage log file path inside the session folder.
+    usage_s3_logs_loc_state = gr.State(logs_data_folder)  # Usage log folder itself; presumably the S3 log location - confirm against callers.
+    return feedback_logs_state, feedback_s3_logs_loc_state, usage_logs_state, usage_s3_logs_loc_state  # Same order as the outputs list of the commented-out .then(create_logs_folder, ...) call in this file.
 # Create the gradio interface
 app = gr.Blocks(theme = gr.themes.Base())
     session_hash_state = gr.State()
     s3_output_folder_state = gr.State()
     feedback_logs_state = gr.State(feedback_data_folder + 'log.csv')
     feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
     usage_logs_state = gr.State(logs_data_folder + 'log.csv')
     usage_s3_logs_loc_state = gr.State(logs_data_folder)
     gr.Markdown(
     """
     # Document redaction
+    Redact personal information from documents, open text, or xlsx/csv tabular data. See the 'Redaction settings' to change various settings such as which types of information to redact (e.g. people, places), or terms to exclude from redaction. If you are getting 0 redactions, it's possible that the text in the document is saved in image format instead of as selectable text. Select 'Image analysis' on the Settings page in this case.
     WARNING: In testing the app seems to only find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
     """)
         with gr.Accordion("Settings for documents", open = True):
             in_redaction_method = gr.Radio(label="Default document redaction method - text analysis is faster is not useful for image-based PDFs. Imaged-based is slightly less accurate in general.", value = "Text analysis", choices=["Text analysis", "Image analysis"])
+            with gr.Row():
+                page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
+                page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
         with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
             anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
     # Document redaction
     redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
+    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max],
                     outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state], api_name="redact_doc")
     # If the output file count text box changes, keep going with redacting each document until done
     text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
+    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max],
                     outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state]).\
     then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
     then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
     #app.load(wipe_logs, inputs=[feedback_logs_state, usage_logs_state], outputs=[]).\
+    #    then(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
+    app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])#.\
+    #then(create_logs_folder, inputs=[session_hash_textbox], outputs = [feedback_logs_state, feedback_s3_logs_loc_state, usage_logs_state, usage_s3_logs_loc_state])
     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     callback = gr.CSVLogger()
 if __name__ == "__main__":
     if os.environ['COGNITO_AUTH'] == "1":
+        app.queue().launch(show_error=True, auth=authenticate_user, max_file_size='50mb')
     else:
+        app.queue().launch(show_error=True, inbrowser=True, max_file_size='50mb')

requirements.txt CHANGED Viewed

@@ -1,9 +1,9 @@
 pdfminer.six==20231228
 pdf2image==1.17.0
 opencv-python==4.9.0.80
-presidio_analyzer==2.2.354
-presidio_anonymizer==2.2.354
-presidio-image-redactor==0.0.52
 pikepdf==8.15.1
 pandas==2.2.2
 spacy==3.7.5

 pdfminer.six==20231228
 pdf2image==1.17.0
 opencv-python==4.9.0.80
+presidio_analyzer==2.2.355
+presidio_anonymizer==2.2.355
+presidio-image-redactor==0.0.53
 pikepdf==8.15.1
 pandas==2.2.2
 spacy==3.7.5

tools/file_conversion.py CHANGED Viewed

@@ -36,7 +36,7 @@ def is_pdf(filename):
 # %%
 ## Convert pdf to image if necessary
-def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
     # Get the number of pages in the PDF
     page_count = pdfinfo_from_path(pdf_path)['Pages']
@@ -46,21 +46,26 @@ def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
     # Open the PDF file
     #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
-    for page_num in range(0,page_count): #progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
         # print("Current page: ", str(page_num + 1))
         # Convert one page to image
         image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300, use_cropbox=True, use_pdftocairo=False)
         # If no images are returned, break the loop
         if not image:
             print("Conversion of page", str(page_num), "to file failed.")
             break
         images.extend(image)
     print("PDF has been converted to images.")
     return images
@@ -146,7 +151,7 @@ def prepare_image_or_text_pdf(
     #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
     file_paths_loop = [file_paths[int(latest_file_completed)]]
-    print("file_paths_loop:", str(file_paths_loop))
     #for file in progress.tqdm(file_paths, desc="Preparing files"):
     for file in file_paths_loop:
@@ -169,7 +174,7 @@ def prepare_image_or_text_pdf(
                 return out_message, out_file_paths
             out_file_path = process_file(file_path)
-            print("Out file path at image conversion step:", out_file_path)
         elif in_redact_method == "Text analysis":
             if is_pdf(file_path) == False:

 # %%
 ## Convert pdf to image if necessary
+def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(track_tqdm=True)):
     # Get the number of pages in the PDF
     page_count = pdfinfo_from_path(pdf_path)['Pages']
     # Open the PDF file
     #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
+    for page_num in range(page_min,page_count): #progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
         # print("Current page: ", str(page_num + 1))
         # Convert one page to image
         image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300, use_cropbox=True, use_pdftocairo=False)
         # If no images are returned, break the loop
         if not image:
             print("Conversion of page", str(page_num), "to file failed.")
             break
+        # print("Conversion of page", str(page_num), "to file succeeded.")
+        # print("image:", image)
         images.extend(image)
     print("PDF has been converted to images.")
+    # print("Images:", images)
     return images
     #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
     file_paths_loop = [file_paths[int(latest_file_completed)]]
+    #print("file_paths_loop:", str(file_paths_loop))
     #for file in progress.tqdm(file_paths, desc="Preparing files"):
     for file in file_paths_loop:
                 return out_message, out_file_paths
             out_file_path = process_file(file_path)
+            #print("Out file path at image conversion step:", out_file_path)
         elif in_redact_method == "Text analysis":
             if is_pdf(file_path) == False:

tools/file_redaction.py CHANGED Viewed

@@ -18,7 +18,7 @@ from tools.data_anonymise import generate_decision_process_output
 import gradio as gr
-def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, progress=gr.Progress(track_tqdm=True)):
     tic = time.perf_counter()
@@ -73,7 +73,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
             #     return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
             print("Redacting file as image-based file")
-            pdf_images, output_logs = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf)
             out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
             pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
@@ -97,7 +97,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
             # Analyse text-based pdf
             print('Redacting file as text-based PDF')
-            pdf_text, output_logs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
             out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
             pdf_text.save(out_text_file_path)
@@ -175,12 +175,13 @@ def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
                 merged_bboxes.append(merged_box)
             return merged_bboxes
-def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, progress=Progress(track_tqdm=True)):
     '''
     Take an path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
     '''
     fill = (0, 0, 0)
     if not image_paths:
         out_message = "PDF does not exist as images. Converting pages to image"
@@ -190,59 +191,101 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
         image_paths = process_file(file_path)
     images = []
-    number_of_pages = len(image_paths)
     out_message = "Redacting pages"
     print(out_message)
     #progress(0.1, desc=out_message)
-    #for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
-    for i in range(0, number_of_pages):
-        print("Redacting page", str(i + 1))
-        # Get the image to redact using PIL lib (pillow)
-        #print("image_paths:", image_paths)
-        image = ImageChops.duplicate(image_paths[i])
-        # %%
-        image_analyser = ImageAnalyzerEngine(nlp_analyser)
-        engine = ImageRedactorEngine(image_analyser)
-        if language == 'en':
-            ocr_lang = 'eng'
-        else: ocr_lang = language
-        bboxes = image_analyser.analyze(image,ocr_kwargs={"lang": ocr_lang},
-                **{
-                "allow_list": allow_list,
-                "language": language,
-                "entities": chosen_redact_entities,
-                "score_threshold": score_threshold,
-                "return_decision_process":True,
-            })
-        # Text placeholder in this processing step, as the analyze method does not return the OCR text
-        if bboxes:
-            decision_process_output_str = str(bboxes)
-            print("Decision process:", decision_process_output_str)
-        #print("For page: ", str(i), "Bounding boxes: ", bboxes)
-        draw = ImageDraw.Draw(image)
-        merged_bboxes = merge_img_bboxes(bboxes)
-        #print("For page:", str(i), "Merged bounding boxes:", merged_bboxes)
-        # 3. Draw the merged boxes (unchanged)
-        for box in merged_bboxes:
-            x0 = box.left
-            y0 = box.top
-            x1 = x0 + box.width
-            y1 = y0 + box.height
-            draw.rectangle([x0, y0, x1, y1], fill=fill)
         images.append(image)
@@ -358,7 +401,7 @@ def create_annotations_for_bounding_boxes(analyzed_bounding_boxes):
         annotations_on_page.append(annotation)
     return annotations_on_page
-def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
     '''
     Redact chosen entities from a pdf that is made up of multiple pages that are not images.
     '''
@@ -370,13 +413,30 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
     pdf = Pdf.open(filename)
     page_num = 0
-    for page in pdf.pages:
-        print("Page number is:", page_num + 1)
         annotations_on_page = []
         decision_process_table_on_page = []
-        for page_layout in extract_pages(filename, page_numbers = [page_num], maxpages=1):
             page_analyzer_results = []
             page_analyzed_bounding_boxes = []
@@ -403,8 +463,8 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
             annotations_all_pages.extend([annotations_on_page])
             decision_process_table_all_pages.extend([decision_process_table_on_page])
-            print("For page number:", page_num, "there are", len(annotations_all_pages[page_num]), "annotations")
-            page_num += 1
     return pdf, decision_process_table_all_pages

 import gradio as gr
+def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, progress=gr.Progress(track_tqdm=True)):
     tic = time.perf_counter()
             #     return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
             print("Redacting file as image-based file")
+            pdf_images, output_logs = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max)
             out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
             pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
             # Analyse text-based pdf
             print('Redacting file as text-based PDF')
+            pdf_text, output_logs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max)
             out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
             pdf_text.save(out_text_file_path)
                 merged_bboxes.append(merged_box)
             return merged_bboxes
+def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, page_min:int=0, page_max:int=999, progress=Progress(track_tqdm=True)):
     '''
     Take an path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
     '''
     fill = (0, 0, 0)
+    decision_process_output_str = ""
     if not image_paths:
         out_message = "PDF does not exist as images. Converting pages to image"
         image_paths = process_file(file_path)
     images = []
+    #print("Image paths:", image_paths)
+    number_of_pages = len(image_paths[0])
+    print("Number of pages:", str(number_of_pages))
     out_message = "Redacting pages"
     print(out_message)
     #progress(0.1, desc=out_message)
+    # Check that page_min and page_max are within expected ranges
+    if page_max > number_of_pages or page_max == 0:
+        page_max = number_of_pages
+    #else:
+    #    page_max = page_max - 1
+    if page_min <= 0:
+        page_min = 0
+    else:
+        page_min = page_min - 1
+    print("Page range:", str(page_min), "to", str(page_max))
+    #for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
+    for n in range(0, number_of_pages):
+        try:
+            image = image_paths[0][n]#.copy()
+            print("Skipping page", str(n))
+            #print("image:", image)
+        except Exception as e:
+            print("Could not redact page:", str(i), "due to:")
+            print(e)
+            continue
+        if n >= page_min and n <= page_max:
+        #for i in range(page_min, page_max):
+            i = n
+            print("Redacting page", str(i))
+            # Get the image to redact using PIL lib (pillow)
+            #print("image_paths:", image_paths)
+            #image = ImageChops.duplicate(image_paths[i])
+            #print("Image paths i:", image_paths[0])
+            # Assuming image_paths[i] is your PIL image object
+            try:
+                image = image_paths[0][i]#.copy()
+                #print("image:", image)
+            except Exception as e:
+                print("Could not redact page:", str(i), "due to:")
+                print(e)
+                continue
+            # %%
+            image_analyser = ImageAnalyzerEngine(nlp_analyser)
+            engine = ImageRedactorEngine(image_analyser)
+            if language == 'en':
+                ocr_lang = 'eng'
+            else: ocr_lang = language
+            bboxes = image_analyser.analyze(image,ocr_kwargs={"lang": ocr_lang},
+                    **{
+                    "allow_list": allow_list,
+                    "language": language,
+                    "entities": chosen_redact_entities,
+                    "score_threshold": score_threshold,
+                    "return_decision_process":True,
+                })
+            # Text placeholder in this processing step, as the analyze method does not return the OCR text
+            if bboxes:
+                decision_process_output_str = str(bboxes)
+                print("Decision process:", decision_process_output_str)
+            #print("For page: ", str(i), "Bounding boxes: ", bboxes)
+            draw = ImageDraw.Draw(image)
+            merged_bboxes = merge_img_bboxes(bboxes)
+            #print("For page:", str(i), "Merged bounding boxes:", merged_bboxes)
+            # 3. Draw the merged boxes (unchanged)
+            for box in merged_bboxes:
+                x0 = box.left
+                y0 = box.top
+                x1 = x0 + box.width
+                y1 = y0 + box.height
+                draw.rectangle([x0, y0, x1, y1], fill=fill)
         images.append(image)
         annotations_on_page.append(annotation)
     return annotations_on_page
+def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, page_min:int=0, page_max:int=999, progress=Progress(track_tqdm=True)):
     '''
     Redact chosen entities from a pdf that is made up of multiple pages that are not images.
     '''
     pdf = Pdf.open(filename)
     page_num = 0
+    number_of_pages = len(pdf.pages)
+    # Check that page_min and page_max are within expected ranges
+    if page_max > number_of_pages or page_max == 0:
+        page_max = number_of_pages
+    #else:
+    #    page_max = page_max - 1
+    if page_min <= 0:
+        page_min = 0
+    else:
+        page_min = page_min - 1
+    print("Page range is",str(page_min), "to", str(page_max))
+    for page_no in range(page_min, page_max):
+        page = pdf.pages[page_no]
+        print("Page number is:", page_no)
         annotations_on_page = []
         decision_process_table_on_page = []
+        for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
             page_analyzer_results = []
             page_analyzed_bounding_boxes = []
             annotations_all_pages.extend([annotations_on_page])
             decision_process_table_all_pages.extend([decision_process_table_on_page])
+            print("For page number:", page_no, "there are", len(annotations_all_pages[page_num]), "annotations")
+            #page_num += 1
     return pdf, decision_process_table_all_pages