Merge pull request #15 from seanpedrick-case/dev
Added gradio_image_annotation functionality, upgraded gradio, example logs file
- app.py +3 -1
- load_s3_logs.py +66 -0
- requirements.txt +3 -5
- tools/file_conversion.py +1 -1
- tools/redaction_review.py +2 -1
- tools/textract_batch_call.py +21 -16
app.py
CHANGED
@@ -670,7 +670,9 @@ with app:
                 app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
             else: print("Could not load in cost code data")
 
-    ###
+    ###
+    # LOGGING
+    ###
 
     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
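The new LOGGING section relies on CSVLogger_custom, which records page-load events (who accessed the app and when) to a CSV. As a rough illustration of that pattern in plain Gradio - the CSVLogger_custom class, log_file_name and the exact columns are the app's own and not shown in this diff, so the function name and fields below are assumptions:

import csv, os
from datetime import datetime
import gradio as gr

def record_access(request: gr.Request):
    # Append the (optional) authenticated username and a timestamp to a local CSV
    new_file = not os.path.exists("access_log.csv")
    with open("access_log.csv", "a", newline="") as f:
        writer = csv.writer(f)
        if new_file:
            writer.writerow(["username", "access_time"])
        writer.writerow([getattr(request, "username", None), datetime.now().isoformat()])

with gr.Blocks() as demo:
    # Runs once per page load, mirroring the app.load(...) wiring used in app.py
    demo.load(record_access)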
load_s3_logs.py
ADDED
@@ -0,0 +1,66 @@
+import boto3
+import pandas as pd
+from io import StringIO
+from datetime import datetime
+from tools.config import DOCUMENT_REDACTION_BUCKET
+
+# S3 setup
+s3 = boto3.client('s3')
+bucket_name = DOCUMENT_REDACTION_BUCKET
+prefix = 'logs' # 'usage/' # 'feedback/' # Change as needed - top-level folder where logs are stored
+earliest_date = '20250401' # Earliest date of logs folder retrieved
+latest_date = '20250412' # Latest date of logs folder retrieved
+
+# Function to list all files in a folder
+def list_files_in_s3(bucket, prefix):
+    response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
+    if 'Contents' in response:
+        return [content['Key'] for content in response['Contents']]
+    return []
+
+# Function to filter date range
+def is_within_date_range(date_str, start_date, end_date):
+    date_obj = datetime.strptime(date_str, '%Y%m%d')
+    return start_date <= date_obj <= end_date
+
+# Define the date range
+start_date = datetime.strptime('20250401', '%Y%m%d') # Replace with your start date
+end_date = datetime.strptime('20250412', '%Y%m%d') # Replace with your end date
+
+# List all subfolders under 'usage/'
+all_files = list_files_in_s3(bucket_name, prefix)
+
+# Filter based on date range
+log_files = []
+for file in all_files:
+    parts = file.split('/')
+    if len(parts) >= 3:
+        date_str = parts[1]
+        if is_within_date_range(date_str, start_date, end_date) and parts[-1] == 'log.csv':
+            log_files.append(file)
+
+# Download, read and concatenate CSV files into a pandas DataFrame
+df_list = []
+for log_file in log_files:
+    # Download the file
+    obj = s3.get_object(Bucket=bucket_name, Key=log_file)
+    csv_content = obj['Body'].read().decode('utf-8')
+
+    # Read CSV content into pandas DataFrame
+    try:
+        df = pd.read_csv(StringIO(csv_content))
+    except Exception as e:
+        print("Could not load in log file:", log_file, "due to:", e)
+        continue
+
+    df_list.append(df)
+
+# Concatenate all DataFrames
+if df_list:
+    concatenated_df = pd.concat(df_list, ignore_index=True)
+
+    # Save the concatenated DataFrame to a CSV file
+    concatenated_df.to_csv('consolidated_logs.csv', index=False)
+    print("Consolidated CSV saved as 'consolidated_logs.csv'")
+else:
+    print("No log files found in the given date range.")
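One caveat worth noting about the new script: list_objects_v2 returns at most 1,000 keys per call, so list_files_in_s3 can silently miss older logs in large buckets. A drop-in sketch using boto3's paginator (the function name here is illustrative and not part of the PR):

import boto3

def list_files_in_s3_paginated(bucket: str, prefix: str) -> list:
    # Walk every page of results rather than only the first 1,000 keys
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')
    keys = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get('Contents', []):
            keys.append(obj['Key'])
    return keys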
requirements.txt
CHANGED
@@ -11,17 +11,15 @@ scikit-learn==1.6.1
 spacy==3.8.4
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
 #en_core_web_sm @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
-
-https://gradio-pypi-previews.s3.amazonaws.com/3e66dcbc9f3b1d106f5488fb1dca51f0787e6d79/gradio-5.25.0-py3-none-any.whl
+gradio==5.25.2
 boto3==1.37.29
 pyarrow==19.0.1
 openpyxl==3.1.5
 Faker==36.1.1
 python-levenshtein==0.26.1
 spaczz==0.6.1
-#
-#
-https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.0/gradio_image_annotation-0.3.0-py3-none-any.whl
+# The following version
+https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.1/gradio_image_annotation-0.3.1-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
 rapidfuzz==3.12.1
 python-dotenv==1.0.1
 numpy==1.26.4
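A quick way to confirm both pins resolved after pip install -r requirements.txt; the expected version strings come from the requirement lines above and the distribution names from the wheel filenames, while the check itself is illustrative and not part of the PR:

from importlib.metadata import version

print("gradio:", version("gradio"))                                    # expect 5.25.2
print("gradio_image_annotation:", version("gradio_image_annotation"))  # expect 0.3.1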
tools/file_conversion.py
CHANGED
@@ -1036,7 +1036,7 @@ def do_proximity_match_all_pages_for_text(df1:pd.DataFrame, df2:pd.DataFrame, th
     # Check for NaN or infinite values in query_coords and filter them out
     finite_mask = np.isfinite(query_coords).all(axis=1)
     if not finite_mask.all():
-        print("Warning: query_coords contains non-finite values. Filtering out non-finite entries.")
+        #print("Warning: query_coords contains non-finite values. Filtering out non-finite entries.")
         query_coords = query_coords[finite_mask] # Filter out rows with NaN or infinite values
     else:
         pass
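This hunk silences the warning by commenting it out. If the intent is to keep the message available without spamming stdout, an alternative is to route it through the standard logging module at DEBUG level; a self-contained sketch of that idea (toy data, not the PR's code):

import logging
import numpy as np

logger = logging.getLogger(__name__)

query_coords = np.array([[0.1, 0.2], [np.nan, 0.4]])  # toy array with one non-finite row

finite_mask = np.isfinite(query_coords).all(axis=1)
if not finite_mask.all():
    # Emitted only when logging is configured at DEBUG level, e.g. logging.basicConfig(level=logging.DEBUG)
    logger.debug("query_coords contains non-finite values; filtering out non-finite entries.")
    query_coords = query_coords[finite_mask]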
tools/redaction_review.py
CHANGED
@@ -351,7 +351,8 @@ def update_annotator_object_and_filter_df(
         show_share_button=False,
         show_remove_button=False,
         handles_cursor=True,
-        interactive=True
+        interactive=True,
+        use_default_label=True
     )
 
     return out_image_annotator, page_number_reported_gradio, page_number_reported_gradio, page_num_reported, recogniser_entities_dropdown_value, recogniser_dataframe_out_gr, recogniser_dataframe_modified, text_entities_drop, page_entities_drop, page_sizes, all_image_annotations
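use_default_label=True is one of the options the upgraded gradio_image_annotator 0.3.1 wheel adds (see requirements.txt above). For readers unfamiliar with the component, a minimal construction sketch follows; the keyword arguments are copied from this hunk, while the import path and everything else are assumptions about the component rather than code from the repo:

import gradio as gr
from gradio_image_annotation import image_annotator  # assumed import path for the 0.3.1 wheel

with gr.Blocks() as demo:
    annotator = image_annotator(
        value=None,               # the app passes the page image and existing redaction boxes here
        show_share_button=False,
        show_remove_button=False,
        handles_cursor=True,
        interactive=True,
        use_default_label=True,   # newly added option: fresh boxes take the default label automatically
    )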
tools/textract_batch_call.py
CHANGED
@@ -10,15 +10,8 @@ from io import StringIO
 from urllib.parse import urlparse
 from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
 
-# MY_LOCAL_PDF = r"C:\path\to\your\document.pdf" # Use raw string for Windows paths
-# MY_S3_BUCKET = TEXTRACT_BULK_ANALYSIS_BUCKET # MUST BE UNIQUE GLOBALLY
-# MY_S3_INPUT_PREFIX = session_hash_textbox # Folder in the bucket for uploads
-# MY_S3_OUTPUT_PREFIX = session_hash_textbox # Folder in the bucket for results
-# MY_LOCAL_OUTPUT_DIR = OUTPUT_FOLDER # Local folder to save JSON
-# MY_AWS_REGION = AWS_REGION # e.g., 'us-east-1', 'eu-west-1'
 from tools.config import TEXTRACT_BULK_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
-from tools.aws_textract import json_to_ocrresult
-
+#from tools.aws_textract import json_to_ocrresult
 
 def analyse_document_with_textract_api(
     local_pdf_path: str,
@@ -202,9 +195,13 @@ def analyse_document_with_textract_api(
 def return_job_status(job_id:str,
                       response:dict,
                       attempts:int,
-                      poll_interval_seconds: int =
+                      poll_interval_seconds: int = 0,
                       max_polling_attempts: int = 1 # ~10 minutes total wait time
                       ):
+    '''
+    Poll Textract for the current status of a previously-submitted job.
+    '''
+
     job_status = response['JobStatus']
     logging.info(f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}")
 
@@ -232,7 +229,11 @@ def download_textract_job_files(s3_client:str,
                                 s3_output_key_prefix:str,
                                 pdf_filename:str,
                                 job_id:str,
-                                local_output_dir:str):
+                                local_output_dir:str):
+    '''
+    Download and combine selected job files from the AWS Textract service.
+    '''
+
     list_response = s3_client.list_objects_v2(
         Bucket=s3_bucket_name,
         Prefix=s3_output_key_prefix
@@ -329,9 +330,13 @@ def poll_bulk_textract_analysis_progress_and_download(
     load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
     load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
     aws_region: str = AWS_REGION, # Optional: specify region if not default
+    load_jobs_from_s3:str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
     poll_interval_seconds: int = 1,
     max_polling_attempts: int = 1 # ~10 minutes total wait time):
     ):
+    '''
+    Poll AWS for the status of a Textract API job. Return status, and if finished, combine and download results into a locally-stored json file for further processing by the app.
+    '''
 
     if job_id:
         # Initialize boto3 clients
@@ -349,7 +354,7 @@ def poll_bulk_textract_analysis_progress_and_download(
 
         # Update Textract document history df
         try:
-            job_df = load_in_textract_job_details(load_s3_jobs=
+            job_df = load_in_textract_job_details(load_s3_jobs=load_jobs_from_s3,
                                                   load_s3_jobs_loc=load_s3_jobs_loc,
                                                   load_local_jobs_loc=load_local_jobs_loc)
         except Exception as e:
@@ -431,14 +436,15 @@ def poll_bulk_textract_analysis_progress_and_download(
 
     return downloaded_file_path, job_status, job_df
 
-
-
 def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
                                  load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
                                  load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
                                  document_redaction_bucket:str=DOCUMENT_REDACTION_BUCKET,
                                  aws_region:str=AWS_REGION):
-
+    '''
+    Load in a dataframe of jobs previous submitted to the Textract API service.
+    '''
+
     job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time'])
 
     # Initialize boto3 clients
@@ -478,7 +484,6 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
 
     return job_df
 
-
 def download_textract_output(job_id:str,
                              output_bucket:str,
                              output_prefix:str,
@@ -518,4 +523,4 @@ def download_textract_output(job_id:str,
         s3_client.download_file(output_bucket, output_file_key, local_file_path)
         print(f"Output file downloaded to: {local_file_path}")
     except Exception as e:
-        print(f"Error downloading file: {e}")
+        print(f"Error downloading file: {e}")
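The new docstrings describe a poll-then-download flow around Textract's asynchronous API. For reference, the underlying AWS call pattern looks roughly like the sketch below; the client setup, the region and the loop are simplified illustrations rather than this module's implementation:

import time
import boto3

textract = boto3.client("textract", region_name="eu-west-1")  # region is illustrative

def wait_for_textract_job(job_id: str, poll_interval_seconds: int = 30, max_polling_attempts: int = 20) -> str:
    # Poll GetDocumentAnalysis until the asynchronous job leaves IN_PROGRESS
    for _ in range(max_polling_attempts):
        response = textract.get_document_analysis(JobId=job_id, MaxResults=1)
        status = response["JobStatus"]  # IN_PROGRESS, SUCCEEDED, FAILED or PARTIAL_SUCCESS
        if status != "IN_PROGRESS":
            return status
        time.sleep(poll_interval_seconds)
    raise TimeoutError(f"Textract job {job_id} still IN_PROGRESS after {max_polling_attempts} attempts")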