Merge pull request #15 from seanpedrick-case/dev
Added gradio_image_annotation functionality, upgraded gradio, example logs file
- app.py +3 -1
- load_s3_logs.py +66 -0
- requirements.txt +3 -5
- tools/file_conversion.py +1 -1
- tools/redaction_review.py +2 -1
- tools/textract_batch_call.py +21 -16
app.py
CHANGED
@@ -670,7 +670,9 @@ with app:
                 app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
             else: print("Could not load in cost code data")
 
-    ###
+    ###
+    # LOGGING
+    ###
 
     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
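The new LOGGING section relies on CSVLogger_custom, which records page-load events (who accessed the app and when) to a CSV. As a rough illustration of that pattern in plain Gradio - the CSVLogger_custom class, log_file_name and the exact columns are the app's own and not shown in this diff, so the function name and fields below are assumptions:

import csv, os
from datetime import datetime
import gradio as gr

def record_access(request: gr.Request):
    # Append the (optional) authenticated username and a timestamp to a local CSV
    new_file = not os.path.exists("access_log.csv")
    with open("access_log.csv", "a", newline="") as f:
        writer = csv.writer(f)
        if new_file:
            writer.writerow(["username", "access_time"])
        writer.writerow([getattr(request, "username", None), datetime.now().isoformat()])

with gr.Blocks() as demo:
    # Runs once per page load, mirroring the app.load(...) wiring used in app.py
    demo.load(record_access)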
load_s3_logs.py
ADDED
@@ -0,0 +1,66 @@
+import boto3
+import pandas as pd
+from io import StringIO
+from datetime import datetime
+from tools.config import DOCUMENT_REDACTION_BUCKET
+
+# S3 setup
+s3 = boto3.client('s3')
+bucket_name = DOCUMENT_REDACTION_BUCKET
+prefix = 'logs' # 'usage/' # 'feedback/' # Change as needed - top-level folder where logs are stored
+earliest_date = '20250401' # Earliest date of logs folder retrieved
+latest_date = '20250412' # Latest date of logs folder retrieved
+
+# Function to list all files in a folder
+def list_files_in_s3(bucket, prefix):
+    response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
+    if 'Contents' in response:
+        return [content['Key'] for content in response['Contents']]
+    return []
+
+# Function to filter date range
+def is_within_date_range(date_str, start_date, end_date):
+    date_obj = datetime.strptime(date_str, '%Y%m%d')
+    return start_date <= date_obj <= end_date
+
+# Define the date range
+start_date = datetime.strptime('20250401', '%Y%m%d') # Replace with your start date
+end_date = datetime.strptime('20250412', '%Y%m%d') # Replace with your end date
+
+# List all subfolders under 'usage/'
+all_files = list_files_in_s3(bucket_name, prefix)
+
+# Filter based on date range
+log_files = []
+for file in all_files:
+    parts = file.split('/')
+    if len(parts) >= 3:
+        date_str = parts[1]
+        if is_within_date_range(date_str, start_date, end_date) and parts[-1] == 'log.csv':
+            log_files.append(file)
+
+# Download, read and concatenate CSV files into a pandas DataFrame
+df_list = []
+for log_file in log_files:
+    # Download the file
+    obj = s3.get_object(Bucket=bucket_name, Key=log_file)
+    csv_content = obj['Body'].read().decode('utf-8')
+
+    # Read CSV content into pandas DataFrame
+    try:
+        df = pd.read_csv(StringIO(csv_content))
+    except Exception as e:
+        print("Could not load in log file:", log_file, "due to:", e)
+        continue
+
+    df_list.append(df)
+
+# Concatenate all DataFrames
+if df_list:
+    concatenated_df = pd.concat(df_list, ignore_index=True)
+
+    # Save the concatenated DataFrame to a CSV file
+    concatenated_df.to_csv('consolidated_logs.csv', index=False)
+    print("Consolidated CSV saved as 'consolidated_logs.csv'")
+else:
+    print("No log files found in the given date range.")
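One caveat worth noting about the new script: list_objects_v2 returns at most 1,000 keys per call, so list_files_in_s3 can silently miss older logs in large buckets. A drop-in sketch using boto3's paginator (the function name here is illustrative and not part of the PR):

import boto3

def list_files_in_s3_paginated(bucket: str, prefix: str) -> list:
    # Walk every page of results rather than only the first 1,000 keys
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')
    keys = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get('Contents', []):
            keys.append(obj['Key'])
    return keys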
requirements.txt
CHANGED
@@ -11,17 +11,15 @@ scikit-learn==1.6.1
 spacy==3.8.4
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
 #en_core_web_sm @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
-
-https://gradio-pypi-previews.s3.amazonaws.com/3e66dcbc9f3b1d106f5488fb1dca51f0787e6d79/gradio-5.25.0-py3-none-any.whl
+gradio==5.25.2
 boto3==1.37.29
 pyarrow==19.0.1
 openpyxl==3.1.5
 Faker==36.1.1
 python-levenshtein==0.26.1
 spaczz==0.6.1
-#
-#
-https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.0/gradio_image_annotation-0.3.0-py3-none-any.whl
+# The following version
+https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.1/gradio_image_annotation-0.3.1-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
 rapidfuzz==3.12.1
 python-dotenv==1.0.1
 numpy==1.26.4
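A quick way to confirm both pins resolved after pip install -r requirements.txt; the expected version strings come from the requirement lines above and the distribution names from the wheel filenames, while the check itself is illustrative and not part of the PR:

from importlib.metadata import version

print("gradio:", version("gradio"))                                    # expect 5.25.2
print("gradio_image_annotation:", version("gradio_image_annotation"))  # expect 0.3.1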
tools/file_conversion.py
CHANGED
@@ -1036,7 +1036,7 @@ def do_proximity_match_all_pages_for_text(df1:pd.DataFrame, df2:pd.DataFrame, th
     # Check for NaN or infinite values in query_coords and filter them out
     finite_mask = np.isfinite(query_coords).all(axis=1)
     if not finite_mask.all():
-        print("Warning: query_coords contains non-finite values. Filtering out non-finite entries.")
+        #print("Warning: query_coords contains non-finite values. Filtering out non-finite entries.")
         query_coords = query_coords[finite_mask] # Filter out rows with NaN or infinite values
     else:
         pass
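This hunk silences the warning by commenting it out. If the intent is to keep the message available without spamming stdout, an alternative is to route it through the standard logging module at DEBUG level; a self-contained sketch of that idea (toy data, not the PR's code):

import logging
import numpy as np

logger = logging.getLogger(__name__)

query_coords = np.array([[0.1, 0.2], [np.nan, 0.4]])  # toy array with one non-finite row

finite_mask = np.isfinite(query_coords).all(axis=1)
if not finite_mask.all():
    # Emitted only when logging is configured at DEBUG level, e.g. logging.basicConfig(level=logging.DEBUG)
    logger.debug("query_coords contains non-finite values; filtering out non-finite entries.")
    query_coords = query_coords[finite_mask]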
tools/redaction_review.py
CHANGED
@@ -351,7 +351,8 @@ def update_annotator_object_and_filter_df(
         show_share_button=False,
         show_remove_button=False,
         handles_cursor=True,
-        interactive=True
+        interactive=True,
+        use_default_label=True
     )
 
     return out_image_annotator, page_number_reported_gradio, page_number_reported_gradio, page_num_reported, recogniser_entities_dropdown_value, recogniser_dataframe_out_gr, recogniser_dataframe_modified, text_entities_drop, page_entities_drop, page_sizes, all_image_annotations
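use_default_label=True is one of the options the upgraded gradio_image_annotator 0.3.1 wheel adds (see requirements.txt above). For readers unfamiliar with the component, a minimal construction sketch follows; the keyword arguments are copied from this hunk, while the import path and everything else are assumptions about the component rather than code from the repo:

import gradio as gr
from gradio_image_annotation import image_annotator  # assumed import path for the 0.3.1 wheel

with gr.Blocks() as demo:
    annotator = image_annotator(
        value=None,               # the app passes the page image and existing redaction boxes here
        show_share_button=False,
        show_remove_button=False,
        handles_cursor=True,
        interactive=True,
        use_default_label=True,   # newly added option: fresh boxes take the default label automatically
    )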
tools/textract_batch_call.py
CHANGED
@@ -10,15 +10,8 @@ from io import StringIO
 from urllib.parse import urlparse
 from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
 
-# MY_LOCAL_PDF = r"C:\path\to\your\document.pdf" # Use raw string for Windows paths
-# MY_S3_BUCKET = TEXTRACT_BULK_ANALYSIS_BUCKET # MUST BE UNIQUE GLOBALLY
-# MY_S3_INPUT_PREFIX = session_hash_textbox # Folder in the bucket for uploads
-# MY_S3_OUTPUT_PREFIX = session_hash_textbox # Folder in the bucket for results
-# MY_LOCAL_OUTPUT_DIR = OUTPUT_FOLDER # Local folder to save JSON
-# MY_AWS_REGION = AWS_REGION # e.g., 'us-east-1', 'eu-west-1'
 from tools.config import TEXTRACT_BULK_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
-from tools.aws_textract import json_to_ocrresult
-
+#from tools.aws_textract import json_to_ocrresult
 
 def analyse_document_with_textract_api(
     local_pdf_path: str,
@@ -202,9 +195,13 @@ def analyse_document_with_textract_api(
 def return_job_status(job_id:str,
                       response:dict,
                       attempts:int,
-                      poll_interval_seconds: int =
+                      poll_interval_seconds: int = 0,
                       max_polling_attempts: int = 1 # ~10 minutes total wait time
                       ):
+    '''
+    Poll Textract for the current status of a previously-submitted job.
+    '''
+
     job_status = response['JobStatus']
     logging.info(f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}")
 
@@ -232,7 +229,11 @@ def download_textract_job_files(s3_client:str,
                                 s3_output_key_prefix:str,
                                 pdf_filename:str,
                                 job_id:str,
-                                local_output_dir:str):
+                                local_output_dir:str):
+    '''
+    Download and combine selected job files from the AWS Textract service.
+    '''
+
     list_response = s3_client.list_objects_v2(
         Bucket=s3_bucket_name,
         Prefix=s3_output_key_prefix
@@ -329,9 +330,13 @@ def poll_bulk_textract_analysis_progress_and_download(
     load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
     load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
     aws_region: str = AWS_REGION, # Optional: specify region if not default
+    load_jobs_from_s3:str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
     poll_interval_seconds: int = 1,
     max_polling_attempts: int = 1 # ~10 minutes total wait time):
     ):
+    '''
+    Poll AWS for the status of a Textract API job. Return status, and if finished, combine and download results into a locally-stored json file for further processing by the app.
+    '''
 
     if job_id:
         # Initialize boto3 clients
@@ -349,7 +354,7 @@ def poll_bulk_textract_analysis_progress_and_download(
 
         # Update Textract document history df
         try:
-            job_df = load_in_textract_job_details(load_s3_jobs=
+            job_df = load_in_textract_job_details(load_s3_jobs=load_jobs_from_s3,
                                                   load_s3_jobs_loc=load_s3_jobs_loc,
                                                   load_local_jobs_loc=load_local_jobs_loc)
         except Exception as e:
@@ -431,14 +436,15 @@ def poll_bulk_textract_analysis_progress_and_download(
 
     return downloaded_file_path, job_status, job_df
 
-
-
 def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
                                  load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
                                  load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
                                  document_redaction_bucket:str=DOCUMENT_REDACTION_BUCKET,
                                  aws_region:str=AWS_REGION):
-
+    '''
+    Load in a dataframe of jobs previous submitted to the Textract API service.
+    '''
+
     job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time'])
 
     # Initialize boto3 clients
@@ -478,7 +484,6 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
 
     return job_df
 
-
 def download_textract_output(job_id:str,
                              output_bucket:str,
                              output_prefix:str,
@@ -518,4 +523,4 @@ def download_textract_output(job_id:str,
         s3_client.download_file(output_bucket, output_file_key, local_file_path)
         print(f"Output file downloaded to: {local_file_path}")
     except Exception as e:
-        print(f"Error downloading file: {e}")
+        print(f"Error downloading file: {e}")
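The new docstrings describe a poll-then-download flow around Textract's asynchronous API. For reference, the underlying AWS call pattern looks roughly like the sketch below; the client setup, the region and the loop are simplified illustrations rather than this module's implementation:

import time
import boto3

textract = boto3.client("textract", region_name="eu-west-1")  # region is illustrative

def wait_for_textract_job(job_id: str, poll_interval_seconds: int = 30, max_polling_attempts: int = 20) -> str:
    # Poll GetDocumentAnalysis until the asynchronous job leaves IN_PROGRESS
    for _ in range(max_polling_attempts):
        response = textract.get_document_analysis(JobId=job_id, MaxResults=1)
        status = response["JobStatus"]  # IN_PROGRESS, SUCCEEDED, FAILED or PARTIAL_SUCCESS
        if status != "IN_PROGRESS":
            return status
        time.sleep(poll_interval_seconds)
    raise TimeoutError(f"Textract job {job_id} still IN_PROGRESS after {max_polling_attempts} attempts")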