Spaces:

seanpedrickcase
/

document_redaction

Running

seanpedrickcase committed on May 13, 2024

Commit

a63133d

1 Parent(s): 43287c3

Added some commentary to file conversion and redaction

Files changed (2) hide show

tools/file_conversion.py CHANGED Viewed

@@ -38,12 +38,15 @@ def convert_pdf_to_images(pdf_path, progress=Progress(track_tqdm=True)):
     # Get the number of pages in the PDF
     page_count = pdfinfo_from_path(pdf_path)['Pages']
     images = []
     # Open the PDF file
     for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
         # Convert one page to image
         image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1)

     # Get the number of pages in the PDF
     page_count = pdfinfo_from_path(pdf_path)['Pages']
+    print("Number of pages in PDF: ", str(page_count))
     images = []
     # Open the PDF file
     for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
+        print("Current page: ", str(page_num))
         # Convert one page to image
         image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1)

tools/file_redaction.py CHANGED Viewed

@@ -15,7 +15,9 @@ def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[st
     take an path for an image of a document, then run this image through the Presidio ImageAnalyzer to get a redacted page back
     '''
-    progress(0, desc="Converting pages to image")
     image_paths = process_file(file_path)
@@ -25,10 +27,14 @@ def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[st
     images = []
     number_of_pages = len(image_paths)
-    progress(0.1, desc="Redacting pages")
     for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
         # Get the image to redact using PIL lib (pillow)
         image = image_paths[i] #Image.open(image_paths[i])

     take an path for an image of a document, then run this image through the Presidio ImageAnalyzer to get a redacted page back
     '''
+    out_message = "Converting pages to image"
+    print(out_message)
+    progress(0, desc=out_message)
     image_paths = process_file(file_path)
     images = []
     number_of_pages = len(image_paths)
+    out_message = "Redacting pages"
+    print(out_message)
+    progress(0.1, desc=out_message)
     for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
+        print("Redacting page ", str(i + 1))
         # Get the image to redact using PIL lib (pillow)
         image = image_paths[i] #Image.open(image_paths[i])