Commit 
							
							·
						
						e5dfae7
	
1
								Parent(s):
							
							e2aae24
								
Added an option for running the redact function through the CLI (i.e. without going through the Gradio UI or API). Added test functions for running this through AWS Lambda.
Browse files- Dockerfile +7 -1
- app.py +19 -4
- entrypoint_router.py +23 -0
- lambda_entrypoint.py +66 -0
- tools/aws_functions.py +1 -1
- tools/cli_redact.py +83 -0
- tools/file_conversion.py +22 -5
- tools/file_redaction.py +12 -2
- tools/redaction_review.py +1 -1
    	
        Dockerfile
    CHANGED
    
    | @@ -14,6 +14,9 @@ RUN pip install --no-cache-dir --target=/install -r requirements.txt | |
| 14 |  | 
| 15 | 
             
            RUN rm requirements.txt
         | 
| 16 |  | 
|  | |
|  | |
|  | |
| 17 | 
             
            # Stage 2: Final runtime image
         | 
| 18 | 
             
            FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
         | 
| 19 |  | 
| @@ -62,4 +65,7 @@ WORKDIR $HOME/app | |
| 62 | 
             
            # Copy the current directory contents into the container at $HOME/app setting the owner to the user
         | 
| 63 | 
             
            COPY --chown=user . $HOME/app
         | 
| 64 |  | 
| 65 | 
            -
             | 
|  | |
|  | |
|  | 
|  | |
| 14 |  | 
| 15 | 
             
            RUN rm requirements.txt
         | 
| 16 |  | 
| 17 | 
            +
            # Add lambda_entrypoint.py to the container
         | 
| 18 | 
            +
            COPY lambda_entrypoint.py .
         | 
| 19 | 
            +
             | 
| 20 | 
             
            # Stage 2: Final runtime image
         | 
| 21 | 
             
            FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
         | 
| 22 |  | 
|  | |
| 65 | 
             
            # Copy the current directory contents into the container at $HOME/app setting the owner to the user
         | 
| 66 | 
             
            COPY --chown=user . $HOME/app
         | 
| 67 |  | 
| 68 | 
            +
            # Keep the default entrypoint flexible: entrypoint_router.py picks direct mode or the Gradio app from RUN_DIRECT_MODE
         | 
| 69 | 
            +
            ENTRYPOINT ["python", "-u", "entrypoint_router.py"]
         | 
| 70 | 
            +
             | 
| 71 | 
            +
            #CMD ["python", "app.py"]
         | 
    	
        app.py
    CHANGED
    
    | @@ -364,7 +364,7 @@ with app: | |
| 364 |  | 
| 365 | 
             
                # If running on AWS, load in the default allow list file from S3
         | 
| 366 | 
             
                if RUN_AWS_FUNCTIONS == "1":
         | 
| 367 | 
            -
                    print("default_allow_list_output_folder_location:",  | 
| 368 | 
             
                    if not os.path.exists(default_allow_list_loc):
         | 
| 369 | 
             
                        app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
         | 
| 370 | 
             
                        then(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
         | 
| @@ -399,11 +399,26 @@ with app: | |
| 399 | 
             
            COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
         | 
| 400 | 
             
            print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
         | 
| 401 |  | 
|  | |
|  | |
|  | |
| 402 | 
             
            if __name__ == "__main__":
         | 
| 403 | 
            -
             | 
| 404 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 405 | 
             
                else:
         | 
| 406 | 
            -
                     | 
|  | |
|  | |
|  | |
|  | |
| 407 |  | 
| 408 |  | 
| 409 | 
             
            # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
         | 
|  | |
| 364 |  | 
| 365 | 
             
                # If running on AWS, load in the default allow list file from S3
         | 
| 366 | 
             
                if RUN_AWS_FUNCTIONS == "1":
         | 
| 367 | 
            +
                    print("default_allow_list_output_folder_location:", default_allow_list_loc)
         | 
| 368 | 
             
                    if not os.path.exists(default_allow_list_loc):
         | 
| 369 | 
             
                        app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
         | 
| 370 | 
             
                        then(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
         | 
|  | |
| 399 | 
             
            COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
         | 
| 400 | 
             
            print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
         | 
| 401 |  | 
| 402 | 
            +
            RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
         | 
| 403 | 
            +
            print(f'The value of RUN_DIRECT_MODE is {RUN_DIRECT_MODE}')
         | 
| 404 | 
            +
             | 
| 405 | 
             
            if __name__ == "__main__":
         | 
| 406 | 
            +
             | 
| 407 | 
            +
                if RUN_DIRECT_MODE == "0":
         | 
| 408 | 
            +
                    max_queue_size = 5
         | 
| 409 | 
            +
                    max_file_size = '100mb'
         | 
| 410 | 
            +
             | 
| 411 | 
            +
                    if os.environ['COGNITO_AUTH'] == "1":
         | 
| 412 | 
            +
                        app.queue(max_size=max_queue_size).launch(show_error=True, auth=authenticate_user, max_file_size=max_file_size)
         | 
| 413 | 
            +
                    else:
         | 
| 414 | 
            +
                        app.queue(max_size=max_queue_size).launch(show_error=True, inbrowser=True, max_file_size=max_file_size)
         | 
| 415 | 
            +
                
         | 
| 416 | 
             
                else:
         | 
| 417 | 
            +
                    from tools.cli_redact import main
         | 
| 418 | 
            +
             | 
| 419 | 
            +
                    main(first_loop_state, latest_file_completed=0, output_summary="", output_file_list=None, 
         | 
| 420 | 
            +
                     log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0, 
         | 
| 421 | 
            +
                     current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Redact all identified handwriting", "Redact all identified signatures"])
         | 
| 422 |  | 
| 423 |  | 
| 424 | 
             
            # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
         | 
    	
        entrypoint_router.py
    ADDED
    
    | @@ -0,0 +1,23 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import os
         | 
| 2 | 
            +
            import subprocess
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            if __name__ == "__main__":
                # Container entrypoint: choose between direct (Lambda-style) execution
                # and launching the Gradio app, based on the RUN_DIRECT_MODE env var.
                # Local imports keep this block self-contained (only used here).
                import ast
                import json

                run_direct_mode = os.getenv("RUN_DIRECT_MODE", "0")

                if run_direct_mode == "1":
                    # Lambda execution or CLI invocation (Direct Mode)
                    from lambda_entrypoint import lambda_handler

                    # Simulate the Lambda event and context for local testing.
                    # The event comes from the environment, so it must never be passed
                    # to eval(): parse it as JSON, and fall back to a safe Python
                    # literal parse so dict-literal style values keep working.
                    raw_event = os.getenv("LAMBDA_TEST_EVENT", "{}")
                    try:
                        event = json.loads(raw_event)
                    except json.JSONDecodeError:
                        event = ast.literal_eval(raw_event)

                    context = None  # Add mock context if needed
                    response = lambda_handler(event, context)
                    print(response)
                else:
                    # Gradio App execution
                    import app as app_module

                    gradio_app = app_module.app

                    # max_queue_size / max_file_size are only assigned inside app.py's
                    # own __main__ guard, so they are absent when app is imported here;
                    # fall back to the same values app.py uses (5 and '100mb').
                    max_queue_size = getattr(app_module, "max_queue_size", 5)
                    max_file_size = getattr(app_module, "max_file_size", "100mb")

                    if os.getenv("COGNITO_AUTH", "0") == "1":
                        # authenticate_user is a module-level name in app.py, not an
                        # attribute of the Gradio app object.
                        gradio_app.queue(max_size=max_queue_size).launch(
                            show_error=True,
                            auth=app_module.authenticate_user,
                            max_file_size=max_file_size,
                        )
                    else:
                        gradio_app.queue(max_size=max_queue_size).launch(
                            show_error=True,
                            inbrowser=True,
                            max_file_size=max_file_size,
                        )
         | 
    	
        lambda_entrypoint.py
    ADDED
    
    | @@ -0,0 +1,66 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import boto3
         | 
| 2 | 
            +
            import os
         | 
| 3 | 
            +
            import subprocess
         | 
| 4 | 
            +
            from urllib.parse import unquote_plus
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            s3_client = boto3.client("s3")
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            def download_file_from_s3(bucket_name, key, download_path):
                """Fetch one S3 object to local disk and log what was fetched.

                Args:
                    bucket_name: Name of the bucket holding the object.
                    key: Object key inside the bucket.
                    download_path: Local filesystem path to write the object to.
                """
                s3_client.download_file(Bucket=bucket_name, Key=key, Filename=download_path)
                print(f"Downloaded {key} to {download_path}")
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            def upload_file_to_s3(file_path, bucket_name, key):
                """Push one local file to S3 and log where it went.

                Args:
                    file_path: Local filesystem path of the file to send.
                    bucket_name: Name of the destination bucket.
                    key: Object key to store the file under.
                """
                s3_client.upload_file(Filename=file_path, Bucket=bucket_name, Key=key)
                print(f"Uploaded {file_path} to {key}")
         | 
| 17 | 
            +
             | 
| 18 | 
            +
            def lambda_handler(event, context):
                """Main Lambda function handler.

                For every S3 record in ``event`` the input object is downloaded to
                /tmp, ``app.py`` is run on it as a subprocess, and every file found in
                the local output directory is uploaded back to the same bucket under
                ``<input key's directory>/output/``.

                Args:
                    event: S3 notification event (a dict with a "Records" list). An
                        empty or missing event is treated as "nothing to do".
                    context: Lambda context object; not used.

                Returns:
                    dict with ``statusCode`` 200 and a short ``body`` message.

                Raises:
                    subprocess.CalledProcessError: if the ``app.py`` subprocess exits
                        with a non-zero status.
                """
                import shutil  # local import: only needed to reset the output folder

                # NOTE(review): outputs are written back to the triggering bucket. If
                # the S3 notification also covers the ".../output/" prefix this handler
                # will be re-invoked on its own outputs - confirm the trigger filter.
                records = (event or {}).get("Records", [])

                for record in records:
                    bucket_name = record["s3"]["bucket"]["name"]
                    input_key = unquote_plus(record["s3"]["object"]["key"])
                    print(f"Processing file {input_key} from bucket {bucket_name}")

                    # Prepare paths
                    input_file_path = f"/tmp/{os.path.basename(input_key)}"
                    allow_list_path = "/tmp/allow_list.csv"  # Adjust this as needed
                    output_dir = "/tmp/output"
                    # Start from an empty folder so results left over from an earlier
                    # record (or a previous warm invocation) are not uploaded again.
                    shutil.rmtree(output_dir, ignore_errors=True)
                    os.makedirs(output_dir, exist_ok=True)

                    # Download input file
                    download_file_from_s3(bucket_name, input_key, input_file_path)

                    # (Optional) Download allow_list if needed. A missing allow list
                    # is not fatal: processing continues without one.
                    allow_list_key = "path/to/allow_list.csv"  # Adjust path as required
                    try:
                        download_file_from_s3(bucket_name, allow_list_key, allow_list_path)
                        allow_list_args = ["--allow_list", allow_list_path]
                    except s3_client.exceptions.ClientError as err:
                        print(f"Could not download allow list {allow_list_key}: {err}")
                        allow_list_args = []

                    # Construct and run the command
                    command = [
                        "python",
                        "app.py",
                        "--input_file", input_file_path,
                        "--ocr_method", "Complex image analysis - docs with handwriting/signatures (AWS Textract)",
                        "--pii_detector", "AWS Comprehend",
                        "--page_min", "0",
                        "--page_max", "0",
                        *allow_list_args,
                        "--output_dir", output_dir,
                    ]

                    try:
                        result = subprocess.run(command, capture_output=True, text=True, check=True)
                    except subprocess.CalledProcessError as e:
                        print("Error during processing:", e.stderr)
                        raise
                    print("Processing succeeded:", result.stdout)

                    # Upload output files back to S3. For an object at the bucket root
                    # the directory part is empty, so avoid producing a key that
                    # starts with "/".
                    key_dir = os.path.dirname(input_key)
                    output_prefix = f"{key_dir}/output" if key_dir else "output"
                    for root, _, files in os.walk(output_dir):
                        for file_name in files:
                            local_file_path = os.path.join(root, file_name)
                            output_key = f"{output_prefix}/{file_name}"
                            upload_file_to_s3(local_file_path, bucket_name, output_key)

                return {"statusCode": 200, "body": "Processing complete."}
         | 
    	
        tools/aws_functions.py
    CHANGED
    
    | @@ -10,7 +10,7 @@ PandasDataFrame = Type[pd.DataFrame] | |
| 10 | 
             
            # Get AWS credentials
         | 
| 11 | 
             
            bucket_name=""
         | 
| 12 |  | 
| 13 | 
            -
            RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", " | 
| 14 | 
             
            print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
         | 
| 15 |  | 
| 16 | 
             
            AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
         | 
|  | |
| 10 | 
             
            # Get AWS credentials
         | 
| 11 | 
             
            bucket_name=""
         | 
| 12 |  | 
| 13 | 
            +
            RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "1")
         | 
| 14 | 
             
            print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
         | 
| 15 |  | 
| 16 | 
             
            AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
         | 
    	
        tools/cli_redact.py
    ADDED
    
    | @@ -0,0 +1,83 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import argparse
         | 
| 2 | 
            +
            import os
         | 
| 3 | 
            +
            from tools.helper_functions import ensure_output_folder_exists, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
         | 
| 4 | 
            +
            from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
         | 
| 5 | 
            +
            from tools.file_redaction import choose_and_run_redactor
         | 
| 6 | 
            +
            import pandas as pd
         | 
| 7 | 
            +
            from datetime import datetime
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            # Entity types requested from the AWS Comprehend PII detector by default.
            chosen_comprehend_entities = [
                'BANK_ACCOUNT_NUMBER',
                'BANK_ROUTING',
                'CREDIT_DEBIT_NUMBER',
                'CREDIT_DEBIT_CVV',
                'CREDIT_DEBIT_EXPIRY',
                'PIN',
                'EMAIL',
                'ADDRESS',
                'NAME',
                'PHONE',
                'PASSPORT_NUMBER',
                'DRIVER_ID',
                'USERNAME',
                'PASSWORD',
                'IP_ADDRESS',
                'MAC_ADDRESS',
                'LICENSE_PLATE',
                'VEHICLE_IDENTIFICATION_NUMBER',
                'UK_NATIONAL_INSURANCE_NUMBER',
                'INTERNATIONAL_BANK_ACCOUNT_NUMBER',
                'SWIFT_CODE',
                'UK_NATIONAL_HEALTH_SERVICE_NUMBER',
            ]

            # Entity types redacted by default when the local PII detector is used.
            chosen_redact_entities = [
                "TITLES",
                "PERSON",
                "PHONE_NUMBER",
                "EMAIL_ADDRESS",
                "STREETNAME",
                "UKPOSTCODE",
            ]
         | 
| 17 | 
            +
             | 
| 18 | 
            +
            def main(first_loop_state=True, latest_file_completed=0, output_summary="", output_file_list=None,
                     log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
                     current_loop_page=0, page_break=False, pdf_doc_state=None, all_image_annotations=None,
                     all_line_level_ocr_results=None, all_decision_process_table=None,
                     chosen_comprehend_entities=chosen_comprehend_entities,
                     chosen_redact_entities=chosen_redact_entities,
                     handwrite_signature_checkbox=None):
                """Redact a single document from the command line.

                Parses the command-line arguments, prepares the input file with
                ``prepare_image_or_pdf`` and then runs ``choose_and_run_redactor`` on
                it, printing the resulting summary.

                The keyword parameters are the initial values of the state that the
                two functions above take and return; the defaults describe a fresh,
                first-pass run. Mutable defaults are created per call (``None``
                sentinels) so state can never leak between invocations.
                ``pdf_doc_state`` is accepted for signature compatibility and is
                replaced by the value returned from ``prepare_image_or_pdf``.

                Command-line arguments:
                    --input_file   Path to input file (required).
                    --ocr_method   One of the three OCR option constants.
                    --pii_detector Local or AWS PII detector constant.
                    --page_min / --page_max  Page range to redact (0 by default).
                    --allow_list   Optional path to an allow-list CSV.
                    --output_dir   Output directory name (see NOTE below).
                """
                if output_file_list is None:
                    output_file_list = []
                if log_files_list is None:
                    log_files_list = []
                if all_image_annotations is None:
                    all_image_annotations = []
                if all_line_level_ocr_results is None:
                    all_line_level_ocr_results = pd.DataFrame()
                if all_decision_process_table is None:
                    all_decision_process_table = pd.DataFrame()
                if handwrite_signature_checkbox is None:
                    handwrite_signature_checkbox = ["Redact all identified handwriting", "Redact all identified signatures"]

                parser = argparse.ArgumentParser(description='Redact PII from documents via command line')

                # Required arguments
                parser.add_argument('--input_file', required=True, help='Path to input file (PDF, JPG, or PNG)')

                # Optional arguments. argparse does not check a default against
                # `choices`, so the defaults use the option constants themselves to
                # guarantee they are valid values.
                # NOTE(review): the previous literal defaults were 'Quick image analysis'
                # and 'Local'; presumably these correspond to tesseract_ocr_option and
                # local_pii_detector - confirm against tools.helper_functions.
                parser.add_argument('--ocr_method', choices=[text_ocr_option, tesseract_ocr_option, textract_option],
                                    default=tesseract_ocr_option, help='OCR method to use')
                parser.add_argument('--pii_detector', choices=[local_pii_detector, aws_pii_detector],
                                    default=local_pii_detector, help='PII detection method')
                parser.add_argument('--page_min', type=int, default=0, help='First page to redact')
                parser.add_argument('--page_max', type=int, default=0, help='Last page to redact')
                parser.add_argument('--allow_list', help='Path to allow list CSV file')
                parser.add_argument('--output_dir', default='output', help='Output directory')

                args = parser.parse_args()

                # Ensure output directory exists
                # NOTE(review): --output_dir is not passed to anything below, so the
                # files are written wherever the redaction helpers decide; only the
                # final message mentions args.output_dir. Verify and wire it through.
                ensure_output_folder_exists()

                # Create file object similar to what Gradio provides
                file_obj = {"name": args.input_file}

                # Load allow list if provided
                allow_list_df = pd.read_csv(args.allow_list) if args.allow_list else pd.DataFrame()

                # Get file names (result is not used further here)
                get_input_file_names(file_obj)

                # Prepare PDF/image
                (output_summary, prepared_pdf, images_pdf, max_pages, annotate_max_pages_bottom,
                 pdf_doc_state, all_image_annotations) = prepare_image_or_pdf(
                    file_obj, args.ocr_method, allow_list_df, latest_file_completed,
                    output_summary, first_loop_state, args.page_max, current_loop_page, all_image_annotations
                )

                # Run the redaction itself; the returned tuple is the updated state.
                (output_summary, output_files, output_file_list, latest_file_completed, log_files,
                 log_files_list, estimated_time, textract_metadata, pdf_doc_state, all_image_annotations,
                 current_loop_page, page_break, all_line_level_ocr_results, all_decision_process_table,
                 comprehend_query_num) = choose_and_run_redactor(
                    file_obj, prepared_pdf, images_pdf, "en", chosen_redact_entities,
                    chosen_comprehend_entities, args.ocr_method, allow_list_df,
                    latest_file_completed, output_summary, output_file_list, log_files_list,
                    first_loop_state, args.page_min, args.page_max, estimated_time,
                    handwrite_signature_checkbox, textract_metadata, all_image_annotations,
                    all_line_level_ocr_results, all_decision_process_table, pdf_doc_state,
                    current_loop_page, page_break, args.pii_detector, comprehend_query_num
                )

                print(f"\nRedaction complete. Output summary:\n{output_summary}")
                print(f"\nOutput files saved to: {args.output_dir}")


            if __name__ == "__main__":
                main()
         | 
    	
        tools/file_conversion.py
    CHANGED
    
    | @@ -9,6 +9,7 @@ import gradio as gr | |
| 9 | 
             
            import time
         | 
| 10 | 
             
            import json
         | 
| 11 | 
             
            import pymupdf
         | 
|  | |
| 12 | 
             
            from gradio import Progress
         | 
| 13 | 
             
            from typing import List, Optional
         | 
| 14 |  | 
| @@ -47,6 +48,8 @@ def is_pdf(filename): | |
| 47 |  | 
| 48 | 
             
            def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
         | 
| 49 |  | 
|  | |
|  | |
| 50 | 
             
                # Get the number of pages in the PDF
         | 
| 51 | 
             
                page_count = pdfinfo_from_path(pdf_path)['Pages']
         | 
| 52 | 
             
                print("Number of pages in PDF: ", str(page_count))
         | 
| @@ -55,7 +58,9 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = imag | |
| 55 |  | 
| 56 | 
             
                # Open the PDF file
         | 
| 57 | 
             
                #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
         | 
| 58 | 
            -
                for page_num in  | 
|  | |
|  | |
| 59 |  | 
| 60 | 
             
                    print("Converting page: ", str(page_num + 1))
         | 
| 61 |  | 
| @@ -98,7 +103,7 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = imag | |
| 98 | 
             
                return images
         | 
| 99 |  | 
| 100 | 
             
            # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
         | 
| 101 | 
            -
            def process_file(file_path):
         | 
| 102 | 
             
                # Get the file extension
         | 
| 103 | 
             
                file_extension = os.path.splitext(file_path)[1].lower()
         | 
| 104 |  | 
| @@ -130,7 +135,9 @@ def get_input_file_names(file_input): | |
| 130 | 
             
                file_name_with_extension = ""
         | 
| 131 | 
             
                full_file_name = ""
         | 
| 132 |  | 
| 133 | 
            -
                 | 
|  | |
|  | |
| 134 |  | 
| 135 | 
             
                if isinstance(file_input, str):
         | 
| 136 | 
             
                    file_input_list = [file_input]
         | 
| @@ -225,6 +232,9 @@ def prepare_image_or_pdf( | |
| 225 | 
             
                if not file_paths:
         | 
| 226 | 
             
                    file_paths = []
         | 
| 227 |  | 
|  | |
|  | |
|  | |
| 228 | 
             
                if isinstance(file_paths, str):
         | 
| 229 | 
             
                    file_path_number = 1
         | 
| 230 | 
             
                else:
         | 
| @@ -277,8 +287,9 @@ def prepare_image_or_pdf( | |
| 277 |  | 
| 278 | 
             
                    file_extension = os.path.splitext(file_path)[1].lower()
         | 
| 279 |  | 
| 280 | 
            -
             | 
| 281 | 
            -
                    if  | 
|  | |
| 282 | 
             
                        in_redact_method = tesseract_ocr_option
         | 
| 283 |  | 
| 284 |  | 
| @@ -333,6 +344,9 @@ def prepare_image_or_pdf( | |
| 333 | 
             
                                json.dump(json_contents, json_file, indent=4)  # indent=4 makes the JSON file pretty-printed
         | 
| 334 | 
             
                            continue
         | 
| 335 |  | 
|  | |
|  | |
|  | |
| 336 | 
             
                    # Convert pdf/image file to correct format for redaction
         | 
| 337 | 
             
                    if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
         | 
| 338 | 
             
                        if is_pdf_or_image(file_path) == False:
         | 
| @@ -340,6 +354,9 @@ def prepare_image_or_pdf( | |
| 340 | 
             
                            print(out_message)
         | 
| 341 | 
             
                            return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
         | 
| 342 |  | 
|  | |
|  | |
|  | |
| 343 | 
             
                        converted_file_path = process_file(file_path)
         | 
| 344 | 
             
                        image_file_path = converted_file_path
         | 
| 345 |  | 
|  | |
| 9 | 
             
            import time
         | 
| 10 | 
             
            import json
         | 
| 11 | 
             
            import pymupdf
         | 
| 12 | 
            +
            from tqdm import tqdm
         | 
| 13 | 
             
            from gradio import Progress
         | 
| 14 | 
             
            from typing import List, Optional
         | 
| 15 |  | 
|  | |
| 48 |  | 
| 49 | 
             
            def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
         | 
| 50 |  | 
| 51 | 
            +
                print("pdf_path in convert_pdf_to_images:", pdf_path)
         | 
| 52 | 
            +
             | 
| 53 | 
             
                # Get the number of pages in the PDF
         | 
| 54 | 
             
                page_count = pdfinfo_from_path(pdf_path)['Pages']
         | 
| 55 | 
             
                print("Number of pages in PDF: ", str(page_count))
         | 
|  | |
| 58 |  | 
| 59 | 
             
                # Open the PDF file
         | 
| 60 | 
             
                #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
         | 
| 61 | 
            +
                for page_num in tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                    print("page_num in convert_pdf_to_images:", page_num)
         | 
| 64 |  | 
| 65 | 
             
                    print("Converting page: ", str(page_num + 1))
         | 
| 66 |  | 
|  | |
| 103 | 
             
                return images
         | 
| 104 |  | 
| 105 | 
             
            # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
         | 
| 106 | 
            +
            def process_file(file_path:str):
         | 
| 107 | 
             
                # Get the file extension
         | 
| 108 | 
             
                file_extension = os.path.splitext(file_path)[1].lower()
         | 
| 109 |  | 
|  | |
| 135 | 
             
                file_name_with_extension = ""
         | 
| 136 | 
             
                full_file_name = ""
         | 
| 137 |  | 
| 138 | 
            +
                print("file_input in input file names:", file_input)
         | 
| 139 | 
            +
                if isinstance(file_input, dict):
         | 
| 140 | 
            +
                    file_input = os.path.abspath(file_input["name"])
         | 
| 141 |  | 
| 142 | 
             
                if isinstance(file_input, str):
         | 
| 143 | 
             
                    file_input_list = [file_input]
         | 
|  | |
| 232 | 
             
                if not file_paths:
         | 
| 233 | 
             
                    file_paths = []
         | 
| 234 |  | 
| 235 | 
            +
                if isinstance(file_paths, dict):
         | 
| 236 | 
            +
                    file_paths = os.path.abspath(file_paths["name"])
         | 
| 237 | 
            +
             | 
| 238 | 
             
                if isinstance(file_paths, str):
         | 
| 239 | 
             
                    file_path_number = 1
         | 
| 240 | 
             
                else:
         | 
|  | |
| 287 |  | 
| 288 | 
             
                    file_extension = os.path.splitext(file_path)[1].lower()
         | 
| 289 |  | 
| 290 | 
            +
             | 
| 291 | 
            +
                    # Check if the file is an image type and the user selected text ocr option
         | 
| 292 | 
            +
                    if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
         | 
| 293 | 
             
                        in_redact_method = tesseract_ocr_option
         | 
| 294 |  | 
| 295 |  | 
|  | |
| 344 | 
             
                                json.dump(json_contents, json_file, indent=4)  # indent=4 makes the JSON file pretty-printed
         | 
| 345 | 
             
                            continue
         | 
| 346 |  | 
| 347 | 
            +
                    
         | 
| 348 | 
            +
                    print("in_redact_method:", in_redact_method)
         | 
| 349 | 
            +
             | 
| 350 | 
             
                    # Convert pdf/image file to correct format for redaction
         | 
| 351 | 
             
                    if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
         | 
| 352 | 
             
                        if is_pdf_or_image(file_path) == False:
         | 
|  | |
| 354 | 
             
                            print(out_message)
         | 
| 355 | 
             
                            return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
         | 
| 356 |  | 
| 357 | 
            +
                        print("In correct preparation area.")
         | 
| 358 | 
            +
             | 
| 359 | 
            +
                        print("file_path at process_file:", file_path)
         | 
| 360 | 
             
                        converted_file_path = process_file(file_path)
         | 
| 361 | 
             
                        image_file_path = converted_file_path
         | 
| 362 |  | 
    	
        tools/file_redaction.py
    CHANGED
    
    | @@ -180,8 +180,12 @@ def choose_and_run_redactor(file_paths:List[str], | |
| 180 | 
             
                    return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
         | 
| 181 |  | 
| 182 | 
             
                # Create allow list
         | 
|  | |
|  | |
|  | |
|  | |
| 183 | 
             
                if not in_allow_list.empty:
         | 
| 184 | 
            -
                    in_allow_list_flat = in_allow_list[0].tolist()
         | 
| 185 | 
             
                    print("In allow list:", in_allow_list_flat)
         | 
| 186 | 
             
                else:
         | 
| 187 | 
             
                    in_allow_list_flat = []
         | 
| @@ -215,12 +219,18 @@ def choose_and_run_redactor(file_paths:List[str], | |
| 215 | 
             
                progress(0.5, desc="Redacting file")
         | 
| 216 |  | 
| 217 | 
             
                if isinstance(file_paths, str):
         | 
| 218 | 
            -
                    file_paths_list = [file_paths]
         | 
|  | |
|  | |
|  | |
|  | |
| 219 | 
             
                    file_paths_loop = file_paths_list
         | 
| 220 | 
             
                else:
         | 
| 221 | 
             
                    file_paths_list = file_paths
         | 
| 222 | 
             
                    file_paths_loop = [file_paths_list[int(latest_file_completed)]]    
         | 
| 223 |  | 
|  | |
|  | |
| 224 |  | 
| 225 | 
             
                for file in file_paths_loop:
         | 
| 226 | 
             
                    if isinstance(file, str):
         | 
|  | |
| 180 | 
             
                    return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
         | 
| 181 |  | 
| 182 | 
             
                # Create allow list
         | 
| 183 | 
            +
                # If string, assume file path
         | 
| 184 | 
            +
                if isinstance(in_allow_list, str):
         | 
| 185 | 
            +
                    in_allow_list = pd.read_csv(in_allow_list)
         | 
| 186 | 
            +
             | 
| 187 | 
             
                if not in_allow_list.empty:
         | 
| 188 | 
            +
                    in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
         | 
| 189 | 
             
                    print("In allow list:", in_allow_list_flat)
         | 
| 190 | 
             
                else:
         | 
| 191 | 
             
                    in_allow_list_flat = []
         | 
|  | |
| 219 | 
             
                progress(0.5, desc="Redacting file")
         | 
| 220 |  | 
| 221 | 
             
                if isinstance(file_paths, str):
         | 
| 222 | 
            +
                    file_paths_list = [os.path.abspath(file_paths)]
         | 
| 223 | 
            +
                    file_paths_loop = file_paths_list
         | 
| 224 | 
            +
                elif isinstance(file_paths, dict):
         | 
| 225 | 
            +
                    file_paths = file_paths["name"]
         | 
| 226 | 
            +
                    file_paths_list = [os.path.abspath(file_paths)]
         | 
| 227 | 
             
                    file_paths_loop = file_paths_list
         | 
| 228 | 
             
                else:
         | 
| 229 | 
             
                    file_paths_list = file_paths
         | 
| 230 | 
             
                    file_paths_loop = [file_paths_list[int(latest_file_completed)]]    
         | 
| 231 |  | 
| 232 | 
            +
                print("file_paths_list in choose_redactor function:", file_paths_list)
         | 
| 233 | 
            +
             | 
| 234 |  | 
| 235 | 
             
                for file in file_paths_loop:
         | 
| 236 | 
             
                    if isinstance(file, str):
         | 
    	
        tools/redaction_review.py
    CHANGED
    
    | @@ -72,7 +72,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zo | |
| 72 |  | 
| 73 | 
             
                    return out_image_annotator, number_reported, number_reported
         | 
| 74 |  | 
| 75 | 
            -
                print("page_num at start of update_annotator function:", page_num)
         | 
| 76 |  | 
| 77 | 
             
                if page_num is None:
         | 
| 78 | 
             
                    page_num = 0
         | 
|  | |
| 72 |  | 
| 73 | 
             
                    return out_image_annotator, number_reported, number_reported
         | 
| 74 |  | 
| 75 | 
            +
                #print("page_num at start of update_annotator function:", page_num)
         | 
| 76 |  | 
| 77 | 
             
                if page_num is None:
         | 
| 78 | 
             
                    page_num = 0
         |