Commit f6e6d80
Parent(s): 818efbc
Minor function documentation changes. Requirements updated for the new Gradio release and for a Gradio annotator version that allows saving a preferred redaction format and including box ids.
Files changed:
- app.py (+3 -1)
- requirements.txt (+2 -4)
- tools/file_conversion.py (+1 -1)
- tools/redaction_review.py (+2 -1)
- tools/textract_batch_call.py (+21 -16)
app.py CHANGED
@@ -670,7 +670,9 @@ with app:
         app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
     else: print("Could not load in cost code data")
 
-    ###
+    ###
+    # LOGGING
+    ###
 
     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
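For reference, the access logging set up above follows Gradio's flagging-callback pattern. A minimal standalone sketch: stock gr.CSVLogger stands in for the app's own CSVLogger_custom subclass, and the component and "access_log" directory are hypothetical.

import gradio as gr

# Stock CSVLogger in place of the app's CSVLogger_custom subclass
access_callback = gr.CSVLogger()

with gr.Blocks() as demo:
    username_box = gr.Textbox(label="Username")

# Point the callback at the components to record and a log directory (hypothetical)
access_callback.setup([username_box], "access_log")

# Append one row of values (one per component) to the CSV in access_log/
access_callback.flag(["example_user"])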
requirements.txt CHANGED
@@ -11,17 +11,15 @@ scikit-learn==1.6.1
 spacy==3.8.4
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
 #en_core_web_sm @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
-
-https://gradio-pypi-previews.s3.amazonaws.com/3e66dcbc9f3b1d106f5488fb1dca51f0787e6d79/gradio-5.25.0-py3-none-any.whl
+gradio==5.25.2
 boto3==1.37.29
 pyarrow==19.0.1
 openpyxl==3.1.5
 Faker==36.1.1
 python-levenshtein==0.26.1
 spaczz==0.6.1
-#gradio_image_annotation==0.2.5
 # The following version includes rotation and image zoom options
-https://github.com/seanpedrick-case/gradio_image_annotator
+git+https://github.com/seanpedrick-case/gradio_image_annotator.git@v0.3.1 # This version also has the option to use default labels
 rapidfuzz==3.12.1
 python-dotenv==1.0.1
 numpy==1.26.4
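With the requirements installed, the new pins can be sanity-checked at runtime. A minimal sketch using only the standard library; the forked annotator's distribution name is assumed to match upstream's gradio_image_annotation.

from importlib.metadata import version

print(version("gradio"))                   # expected: 5.25.2
print(version("gradio_image_annotation"))  # expected: 0.3.1 (assumed distribution name for the fork)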
tools/file_conversion.py CHANGED
@@ -1036,7 +1036,7 @@ def do_proximity_match_all_pages_for_text(df1:pd.DataFrame, df2:pd.DataFrame, th
     # Check for NaN or infinite values in query_coords and filter them out
     finite_mask = np.isfinite(query_coords).all(axis=1)
     if not finite_mask.all():
-        print("Warning: query_coords contains non-finite values. Filtering out non-finite entries.")
+        #print("Warning: query_coords contains non-finite values. Filtering out non-finite entries.")
         query_coords = query_coords[finite_mask] # Filter out rows with NaN or infinite values
     else:
         pass
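The filtering pattern in this hunk stands alone; a minimal numpy sketch with hypothetical coordinates shows what the mask keeps and drops.

import numpy as np

# Hypothetical box coordinates; the middle row contains a NaN
query_coords = np.array([[0.1, 0.2], [np.nan, 0.4], [0.5, 0.6]])

# True only for rows where every value is finite (no NaN or +/-inf)
finite_mask = np.isfinite(query_coords).all(axis=1)

if not finite_mask.all():
    query_coords = query_coords[finite_mask]  # drop non-finite rows

print(query_coords)  # [[0.1 0.2]
                     #  [0.5 0.6]]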
tools/redaction_review.py CHANGED
@@ -351,7 +351,8 @@ def update_annotator_object_and_filter_df(
         show_share_button=False,
         show_remove_button=False,
         handles_cursor=True,
-        interactive=True
+        interactive=True,
+        use_default_label=True
     )
 
     return out_image_annotator, page_number_reported_gradio, page_number_reported_gradio, page_num_reported, recogniser_entities_dropdown_value, recogniser_dataframe_out_gr, recogniser_dataframe_modified, text_entities_drop, page_entities_drop, page_sizes, all_image_annotations
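For reference, a minimal sketch of constructing the annotator with the new keyword. It assumes the forked component keeps upstream's gradio_image_annotation import path; the keyword arguments are those visible in the diff above, and label_list is a hypothetical extra.

from gradio_image_annotation import image_annotator

annotator = image_annotator(
    value=None,
    label_list=["Redaction"],  # with use_default_label=True, new boxes take the default label
    show_share_button=False,
    show_remove_button=False,
    handles_cursor=True,
    interactive=True,
    use_default_label=True,
)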
tools/textract_batch_call.py CHANGED
@@ -10,15 +10,8 @@ from io import StringIO
 from urllib.parse import urlparse
 from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
 
-# MY_LOCAL_PDF = r"C:\path\to\your\document.pdf" # Use raw string for Windows paths
-# MY_S3_BUCKET = TEXTRACT_BULK_ANALYSIS_BUCKET # MUST BE UNIQUE GLOBALLY
-# MY_S3_INPUT_PREFIX = session_hash_textbox # Folder in the bucket for uploads
-# MY_S3_OUTPUT_PREFIX = session_hash_textbox # Folder in the bucket for results
-# MY_LOCAL_OUTPUT_DIR = OUTPUT_FOLDER # Local folder to save JSON
-# MY_AWS_REGION = AWS_REGION # e.g., 'us-east-1', 'eu-west-1'
 from tools.config import TEXTRACT_BULK_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
-from tools.aws_textract import json_to_ocrresult
-
+#from tools.aws_textract import json_to_ocrresult
 
 def analyse_document_with_textract_api(
     local_pdf_path: str,
@@ -202,9 +195,13 @@ def analyse_document_with_textract_api(
 def return_job_status(job_id:str,
                       response:dict,
                       attempts:int,
-                      poll_interval_seconds: int =
+                      poll_interval_seconds: int = 0,
                       max_polling_attempts: int = 1 # ~10 minutes total wait time
                       ):
+    '''
+    Poll Textract for the current status of a previously-submitted job.
+    '''
+
     job_status = response['JobStatus']
     logging.info(f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}")
 
@@ -232,7 +229,11 @@ def download_textract_job_files(s3_client:str,
                                 s3_output_key_prefix:str,
                                 pdf_filename:str,
                                 job_id:str,
-                                local_output_dir:str):
+                                local_output_dir:str):
+    '''
+    Download and combine selected job files from the AWS Textract service.
+    '''
+
     list_response = s3_client.list_objects_v2(
         Bucket=s3_bucket_name,
         Prefix=s3_output_key_prefix
@@ -329,9 +330,13 @@ poll_bulk_textract_analysis_progress_and_download(
     load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
     load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
     aws_region: str = AWS_REGION, # Optional: specify region if not default
+    load_jobs_from_s3:str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
     poll_interval_seconds: int = 1,
     max_polling_attempts: int = 1 # ~10 minutes total wait time):
     ):
+    '''
+    Poll AWS for the status of a Textract API job. Return status, and if finished, combine and download results into a locally-stored json file for further processing by the app.
+    '''
 
     if job_id:
         # Initialize boto3 clients
@@ -349,7 +354,7 @@ def poll_bulk_textract_analysis_progress_and_download(
 
         # Update Textract document history df
         try:
-            job_df = load_in_textract_job_details(load_s3_jobs=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
+            job_df = load_in_textract_job_details(load_s3_jobs=load_jobs_from_s3,
                                                   load_s3_jobs_loc=load_s3_jobs_loc,
                                                   load_local_jobs_loc=load_local_jobs_loc)
         except Exception as e:
@@ -431,14 +436,15 @@ def poll_bulk_textract_analysis_progress_and_download(
 
     return downloaded_file_path, job_status, job_df
 
-
-
 def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
                                  load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
                                  load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
                                  document_redaction_bucket:str=DOCUMENT_REDACTION_BUCKET,
                                  aws_region:str=AWS_REGION):
-
+    '''
+    Load in a dataframe of jobs previous submitted to the Textract API service.
+    '''
+
     job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time'])
 
     # Initialize boto3 clients
@@ -478,7 +484,6 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
 
     return job_df
 
-
 def download_textract_output(job_id:str,
                              output_bucket:str,
                              output_prefix:str,
@@ -518,4 +523,4 @@ def download_textract_output(job_id:str,
         s3_client.download_file(output_bucket, output_file_key, local_file_path)
         print(f"Output file downloaded to: {local_file_path}")
     except Exception as e:
-        print(f"Error downloading file: {e}")
+        print(f"Error downloading file: {e}")
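For context on the loop that return_job_status and poll_bulk_textract_analysis_progress_and_download wrap, here is a minimal standalone sketch against the boto3 Textract API; the region, interval, and attempt counts are illustrative rather than the app's values.

import time
import boto3

def wait_for_textract_job(job_id: str,
                          poll_interval_seconds: int = 30,
                          max_polling_attempts: int = 20) -> str:
    # Poll get_document_analysis until the job leaves IN_PROGRESS
    textract = boto3.client("textract", region_name="eu-west-1")  # illustrative region
    for attempt in range(1, max_polling_attempts + 1):
        response = textract.get_document_analysis(JobId=job_id)
        job_status = response["JobStatus"]  # IN_PROGRESS, SUCCEEDED or FAILED
        print(f"Polling attempt {attempt}/{max_polling_attempts}. Job status: {job_status}")
        if job_status in ("SUCCEEDED", "FAILED"):
            return job_status
        time.sleep(poll_interval_seconds)
    raise TimeoutError(f"Job {job_id} still in progress after {max_polling_attempts} attempts")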