Commit f6e6d80
Parent(s): 818efbc
Minor function documentation changes. Requirements updated for the new Gradio release and for a Gradio annotator version that allows saving a preferred redaction format and including box ids.
Files changed:
- app.py (+3 -1)
- requirements.txt (+2 -4)
- tools/file_conversion.py (+1 -1)
- tools/redaction_review.py (+2 -1)
- tools/textract_batch_call.py (+21 -16)
app.py CHANGED
@@ -670,7 +670,9 @@ with app:
         app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
     else: print("Could not load in cost code data")
 
-    ###
+    ###
+    # LOGGING
+    ###
 
     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
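For reference, the access logging set up above follows Gradio's flagging-callback pattern. A minimal standalone sketch: stock gr.CSVLogger stands in for the app's own CSVLogger_custom subclass, and the component and "access_log" directory are hypothetical.

import gradio as gr

# Stock CSVLogger in place of the app's CSVLogger_custom subclass
access_callback = gr.CSVLogger()

with gr.Blocks() as demo:
    username_box = gr.Textbox(label="Username")

# Point the callback at the components to record and a log directory (hypothetical)
access_callback.setup([username_box], "access_log")

# Append one row of values (one per component) to the CSV in access_log/
access_callback.flag(["example_user"])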
requirements.txt CHANGED
@@ -11,17 +11,15 @@ scikit-learn==1.6.1
 spacy==3.8.4
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
 #en_core_web_sm @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
-
-https://gradio-pypi-previews.s3.amazonaws.com/3e66dcbc9f3b1d106f5488fb1dca51f0787e6d79/gradio-5.25.0-py3-none-any.whl
+gradio==5.25.2
 boto3==1.37.29
 pyarrow==19.0.1
 openpyxl==3.1.5
 Faker==36.1.1
 python-levenshtein==0.26.1
 spaczz==0.6.1
-#gradio_image_annotation==0.2.5
 # The following version includes rotation and image zoom options
-https://github.com/seanpedrick-case/gradio_image_annotator
+git+https://github.com/seanpedrick-case/gradio_image_annotator.git@v0.3.1 # This version also has the option to use default labels
 rapidfuzz==3.12.1
 python-dotenv==1.0.1
 numpy==1.26.4
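With the requirements installed, the new pins can be sanity-checked at runtime. A minimal sketch using only the standard library; the forked annotator's distribution name is assumed to match upstream's gradio_image_annotation.

from importlib.metadata import version

print(version("gradio"))                   # expected: 5.25.2
print(version("gradio_image_annotation"))  # expected: 0.3.1 (assumed distribution name for the fork)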
tools/file_conversion.py CHANGED
@@ -1036,7 +1036,7 @@ def do_proximity_match_all_pages_for_text(df1:pd.DataFrame, df2:pd.DataFrame, th
     # Check for NaN or infinite values in query_coords and filter them out
     finite_mask = np.isfinite(query_coords).all(axis=1)
     if not finite_mask.all():
-        print("Warning: query_coords contains non-finite values. Filtering out non-finite entries.")
+        #print("Warning: query_coords contains non-finite values. Filtering out non-finite entries.")
         query_coords = query_coords[finite_mask] # Filter out rows with NaN or infinite values
     else:
         pass
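The filtering pattern in this hunk stands alone; a minimal numpy sketch with hypothetical coordinates shows what the mask keeps and drops.

import numpy as np

# Hypothetical box coordinates; the middle row contains a NaN
query_coords = np.array([[0.1, 0.2], [np.nan, 0.4], [0.5, 0.6]])

# True only for rows where every value is finite (no NaN or +/-inf)
finite_mask = np.isfinite(query_coords).all(axis=1)

if not finite_mask.all():
    query_coords = query_coords[finite_mask]  # drop non-finite rows

print(query_coords)  # [[0.1 0.2]
                     #  [0.5 0.6]]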
tools/redaction_review.py CHANGED
@@ -351,7 +351,8 @@ def update_annotator_object_and_filter_df(
         show_share_button=False,
         show_remove_button=False,
         handles_cursor=True,
-        interactive=True
+        interactive=True,
+        use_default_label=True
     )
 
     return out_image_annotator, page_number_reported_gradio, page_number_reported_gradio, page_num_reported, recogniser_entities_dropdown_value, recogniser_dataframe_out_gr, recogniser_dataframe_modified, text_entities_drop, page_entities_drop, page_sizes, all_image_annotations
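For reference, a minimal sketch of constructing the annotator with the new keyword. It assumes the forked component keeps upstream's gradio_image_annotation import path; the keyword arguments are those visible in the diff above, and label_list is a hypothetical extra.

from gradio_image_annotation import image_annotator

annotator = image_annotator(
    value=None,
    label_list=["Redaction"],  # with use_default_label=True, new boxes take the default label
    show_share_button=False,
    show_remove_button=False,
    handles_cursor=True,
    interactive=True,
    use_default_label=True,
)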
tools/textract_batch_call.py CHANGED
@@ -10,15 +10,8 @@ from io import StringIO
 from urllib.parse import urlparse
 from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
 
-# MY_LOCAL_PDF = r"C:\path\to\your\document.pdf" # Use raw string for Windows paths
-# MY_S3_BUCKET = TEXTRACT_BULK_ANALYSIS_BUCKET # MUST BE UNIQUE GLOBALLY
-# MY_S3_INPUT_PREFIX = session_hash_textbox # Folder in the bucket for uploads
-# MY_S3_OUTPUT_PREFIX = session_hash_textbox # Folder in the bucket for results
-# MY_LOCAL_OUTPUT_DIR = OUTPUT_FOLDER # Local folder to save JSON
-# MY_AWS_REGION = AWS_REGION # e.g., 'us-east-1', 'eu-west-1'
 from tools.config import TEXTRACT_BULK_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
-from tools.aws_textract import json_to_ocrresult
-
+#from tools.aws_textract import json_to_ocrresult
 
 def analyse_document_with_textract_api(
     local_pdf_path: str,
@@ -202,9 +195,13 @@ def analyse_document_with_textract_api(
 def return_job_status(job_id:str,
                       response:dict,
                       attempts:int,
-                      poll_interval_seconds: int =
+                      poll_interval_seconds: int = 0,
                       max_polling_attempts: int = 1 # ~10 minutes total wait time
                       ):
+    '''
+    Poll Textract for the current status of a previously-submitted job.
+    '''
+
     job_status = response['JobStatus']
     logging.info(f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}")
 
@@ -232,7 +229,11 @@ def download_textract_job_files(s3_client:str,
                                 s3_output_key_prefix:str,
                                 pdf_filename:str,
                                 job_id:str,
-                                local_output_dir:str):
+                                local_output_dir:str):
+    '''
+    Download and combine selected job files from the AWS Textract service.
+    '''
+
     list_response = s3_client.list_objects_v2(
         Bucket=s3_bucket_name,
         Prefix=s3_output_key_prefix
@@ -329,9 +330,13 @@ poll_bulk_textract_analysis_progress_and_download(
     load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
     load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
     aws_region: str = AWS_REGION, # Optional: specify region if not default
+    load_jobs_from_s3:str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
     poll_interval_seconds: int = 1,
     max_polling_attempts: int = 1 # ~10 minutes total wait time):
     ):
+    '''
+    Poll AWS for the status of a Textract API job. Return status, and if finished, combine and download results into a locally-stored json file for further processing by the app.
+    '''
 
     if job_id:
         # Initialize boto3 clients
@@ -349,7 +354,7 @@ def poll_bulk_textract_analysis_progress_and_download(
 
         # Update Textract document history df
         try:
-            job_df = load_in_textract_job_details(load_s3_jobs=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
+            job_df = load_in_textract_job_details(load_s3_jobs=load_jobs_from_s3,
                                                   load_s3_jobs_loc=load_s3_jobs_loc,
                                                   load_local_jobs_loc=load_local_jobs_loc)
         except Exception as e:
@@ -431,14 +436,15 @@ def poll_bulk_textract_analysis_progress_and_download(
 
     return downloaded_file_path, job_status, job_df
 
-
-
 def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
                                  load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
                                  load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
                                  document_redaction_bucket:str=DOCUMENT_REDACTION_BUCKET,
                                  aws_region:str=AWS_REGION):
-
+    '''
+    Load in a dataframe of jobs previous submitted to the Textract API service.
+    '''
+
     job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time'])
 
     # Initialize boto3 clients
@@ -478,7 +484,6 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
 
     return job_df
 
-
 def download_textract_output(job_id:str,
                              output_bucket:str,
                              output_prefix:str,
@@ -518,4 +523,4 @@ def download_textract_output(job_id:str,
         s3_client.download_file(output_bucket, output_file_key, local_file_path)
         print(f"Output file downloaded to: {local_file_path}")
     except Exception as e:
-        print(f"Error downloading file: {e}")
+        print(f"Error downloading file: {e}")
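For context on the loop that return_job_status and poll_bulk_textract_analysis_progress_and_download wrap, here is a minimal standalone sketch against the boto3 Textract API; the region, interval, and attempt counts are illustrative rather than the app's values.

import time
import boto3

def wait_for_textract_job(job_id: str,
                          poll_interval_seconds: int = 30,
                          max_polling_attempts: int = 20) -> str:
    # Poll get_document_analysis until the job leaves IN_PROGRESS
    textract = boto3.client("textract", region_name="eu-west-1")  # illustrative region
    for attempt in range(1, max_polling_attempts + 1):
        response = textract.get_document_analysis(JobId=job_id)
        job_status = response["JobStatus"]  # IN_PROGRESS, SUCCEEDED or FAILED
        print(f"Polling attempt {attempt}/{max_polling_attempts}. Job status: {job_status}")
        if job_status in ("SUCCEEDED", "FAILED"):
            return job_status
        time.sleep(poll_interval_seconds)
    raise TimeoutError(f"Job {job_id} still in progress after {max_polling_attempts} attempts")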