seanpedrickcase committed on
Commit
f6e6d80
·
1 Parent(s): 818efbc

Minor function documentation changes. Requirements update for the new Gradio version and a version of the Gradio annotator that allows saving the preferred redaction format and including the box id

Browse files
app.py CHANGED
@@ -670,7 +670,9 @@ with app:
670
  app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
671
  else: print("Could not load in cost code data")
672
 
673
- ### LOGGING
 
 
674
 
675
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
676
  access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
 
670
  app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
671
  else: print("Could not load in cost code data")
672
 
673
+ ###
674
+ # LOGGING
675
+ ###
676
 
677
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
678
  access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
requirements.txt CHANGED
@@ -11,17 +11,15 @@ scikit-learn==1.6.1
11
  spacy==3.8.4
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
  #en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
14
- #gradio==5.23.3 # Using latest version of Gradio 5.25.0 below as it fixes the table select issues while filtered
15
- https://gradio-pypi-previews.s3.amazonaws.com/3e66dcbc9f3b1d106f5488fb1dca51f0787e6d79/gradio-5.25.0-py3-none-any.whl
16
  boto3==1.37.29
17
  pyarrow==19.0.1
18
  openpyxl==3.1.5
19
  Faker==36.1.1
20
  python-levenshtein==0.26.1
21
  spaczz==0.6.1
22
- #gradio_image_annotation==0.2.5
23
  # The following version includes rotation and image zoom options
24
- https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.0/gradio_image_annotation-0.3.0-py3-none-any.whl
25
  rapidfuzz==3.12.1
26
  python-dotenv==1.0.1
27
  numpy==1.26.4
 
11
  spacy==3.8.4
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
  #en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
14
+ gradio==5.25.2
 
15
  boto3==1.37.29
16
  pyarrow==19.0.1
17
  openpyxl==3.1.5
18
  Faker==36.1.1
19
  python-levenshtein==0.26.1
20
  spaczz==0.6.1
 
21
  # The following version includes rotation and image zoom options
22
+ git+https://github.com/seanpedrick-case/gradio_image_annotator.git@v0.3.1 # This version also has the option to use default labels
23
  rapidfuzz==3.12.1
24
  python-dotenv==1.0.1
25
  numpy==1.26.4
tools/file_conversion.py CHANGED
@@ -1036,7 +1036,7 @@ def do_proximity_match_all_pages_for_text(df1:pd.DataFrame, df2:pd.DataFrame, th
1036
  # Check for NaN or infinite values in query_coords and filter them out
1037
  finite_mask = np.isfinite(query_coords).all(axis=1)
1038
  if not finite_mask.all():
1039
- print("Warning: query_coords contains non-finite values. Filtering out non-finite entries.")
1040
  query_coords = query_coords[finite_mask] # Filter out rows with NaN or infinite values
1041
  else:
1042
  pass
 
1036
  # Check for NaN or infinite values in query_coords and filter them out
1037
  finite_mask = np.isfinite(query_coords).all(axis=1)
1038
  if not finite_mask.all():
1039
+ #print("Warning: query_coords contains non-finite values. Filtering out non-finite entries.")
1040
  query_coords = query_coords[finite_mask] # Filter out rows with NaN or infinite values
1041
  else:
1042
  pass
tools/redaction_review.py CHANGED
@@ -351,7 +351,8 @@ def update_annotator_object_and_filter_df(
351
  show_share_button=False,
352
  show_remove_button=False,
353
  handles_cursor=True,
354
- interactive=True
 
355
  )
356
 
357
  return out_image_annotator, page_number_reported_gradio, page_number_reported_gradio, page_num_reported, recogniser_entities_dropdown_value, recogniser_dataframe_out_gr, recogniser_dataframe_modified, text_entities_drop, page_entities_drop, page_sizes, all_image_annotations
 
351
  show_share_button=False,
352
  show_remove_button=False,
353
  handles_cursor=True,
354
+ interactive=True,
355
+ use_default_label=True
356
  )
357
 
358
  return out_image_annotator, page_number_reported_gradio, page_number_reported_gradio, page_num_reported, recogniser_entities_dropdown_value, recogniser_dataframe_out_gr, recogniser_dataframe_modified, text_entities_drop, page_entities_drop, page_sizes, all_image_annotations
tools/textract_batch_call.py CHANGED
@@ -10,15 +10,8 @@ from io import StringIO
10
  from urllib.parse import urlparse
11
  from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
12
 
13
- # MY_LOCAL_PDF = r"C:\path\to\your\document.pdf" # Use raw string for Windows paths
14
- # MY_S3_BUCKET = TEXTRACT_BULK_ANALYSIS_BUCKET # MUST BE UNIQUE GLOBALLY
15
- # MY_S3_INPUT_PREFIX = session_hash_textbox # Folder in the bucket for uploads
16
- # MY_S3_OUTPUT_PREFIX = session_hash_textbox # Folder in the bucket for results
17
- # MY_LOCAL_OUTPUT_DIR = OUTPUT_FOLDER # Local folder to save JSON
18
- # MY_AWS_REGION = AWS_REGION # e.g., 'us-east-1', 'eu-west-1'
19
  from tools.config import TEXTRACT_BULK_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
20
- from tools.aws_textract import json_to_ocrresult
21
-
22
 
23
  def analyse_document_with_textract_api(
24
  local_pdf_path: str,
@@ -202,9 +195,13 @@ def analyse_document_with_textract_api(
202
  def return_job_status(job_id:str,
203
  response:dict,
204
  attempts:int,
205
- poll_interval_seconds: int = 5,
206
  max_polling_attempts: int = 1 # ~10 minutes total wait time
207
  ):
 
 
 
 
208
  job_status = response['JobStatus']
209
  logging.info(f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}")
210
 
@@ -232,7 +229,11 @@ def download_textract_job_files(s3_client:str,
232
  s3_output_key_prefix:str,
233
  pdf_filename:str,
234
  job_id:str,
235
- local_output_dir:str):
 
 
 
 
236
  list_response = s3_client.list_objects_v2(
237
  Bucket=s3_bucket_name,
238
  Prefix=s3_output_key_prefix
@@ -329,9 +330,13 @@ def poll_bulk_textract_analysis_progress_and_download(
329
  load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
330
  load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
331
  aws_region: str = AWS_REGION, # Optional: specify region if not default
 
332
  poll_interval_seconds: int = 1,
333
  max_polling_attempts: int = 1 # ~10 minutes total wait time):
334
  ):
 
 
 
335
 
336
  if job_id:
337
  # Initialize boto3 clients
@@ -349,7 +354,7 @@ def poll_bulk_textract_analysis_progress_and_download(
349
 
350
  # Update Textract document history df
351
  try:
352
- job_df = load_in_textract_job_details(load_s3_jobs=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
353
  load_s3_jobs_loc=load_s3_jobs_loc,
354
  load_local_jobs_loc=load_local_jobs_loc)
355
  except Exception as e:
@@ -431,14 +436,15 @@ def poll_bulk_textract_analysis_progress_and_download(
431
 
432
  return downloaded_file_path, job_status, job_df
433
 
434
-
435
-
436
  def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
437
  load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
438
  load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
439
  document_redaction_bucket:str=DOCUMENT_REDACTION_BUCKET,
440
  aws_region:str=AWS_REGION):
441
-
 
 
 
442
  job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time'])
443
 
444
  # Initialize boto3 clients
@@ -478,7 +484,6 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
478
 
479
  return job_df
480
 
481
-
482
  def download_textract_output(job_id:str,
483
  output_bucket:str,
484
  output_prefix:str,
@@ -518,4 +523,4 @@ def download_textract_output(job_id:str,
518
  s3_client.download_file(output_bucket, output_file_key, local_file_path)
519
  print(f"Output file downloaded to: {local_file_path}")
520
  except Exception as e:
521
- print(f"Error downloading file: {e}")
 
10
  from urllib.parse import urlparse
11
  from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
12
 
 
 
 
 
 
 
13
  from tools.config import TEXTRACT_BULK_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
14
+ #from tools.aws_textract import json_to_ocrresult
 
15
 
16
  def analyse_document_with_textract_api(
17
  local_pdf_path: str,
 
195
  def return_job_status(job_id:str,
196
  response:dict,
197
  attempts:int,
198
+ poll_interval_seconds: int = 0,
199
  max_polling_attempts: int = 1 # ~10 minutes total wait time
200
  ):
201
+ '''
202
+ Poll Textract for the current status of a previously-submitted job.
203
+ '''
204
+
205
  job_status = response['JobStatus']
206
  logging.info(f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}")
207
 
 
229
  s3_output_key_prefix:str,
230
  pdf_filename:str,
231
  job_id:str,
232
+ local_output_dir:str):
233
+ '''
234
+ Download and combine selected job files from the AWS Textract service.
235
+ '''
236
+
237
  list_response = s3_client.list_objects_v2(
238
  Bucket=s3_bucket_name,
239
  Prefix=s3_output_key_prefix
 
330
  load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
331
  load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
332
  aws_region: str = AWS_REGION, # Optional: specify region if not default
333
+ load_jobs_from_s3:str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
334
  poll_interval_seconds: int = 1,
335
  max_polling_attempts: int = 1 # ~10 minutes total wait time):
336
  ):
337
+ '''
338
+ Poll AWS for the status of a Textract API job. Return status, and if finished, combine and download results into a locally-stored json file for further processing by the app.
339
+ '''
340
 
341
  if job_id:
342
  # Initialize boto3 clients
 
354
 
355
  # Update Textract document history df
356
  try:
357
+ job_df = load_in_textract_job_details(load_s3_jobs=load_jobs_from_s3,
358
  load_s3_jobs_loc=load_s3_jobs_loc,
359
  load_local_jobs_loc=load_local_jobs_loc)
360
  except Exception as e:
 
436
 
437
  return downloaded_file_path, job_status, job_df
438
 
 
 
439
  def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
440
  load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
441
  load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
442
  document_redaction_bucket:str=DOCUMENT_REDACTION_BUCKET,
443
  aws_region:str=AWS_REGION):
444
+ '''
445
+ Load in a dataframe of jobs previous submitted to the Textract API service.
446
+ '''
447
+
448
  job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time'])
449
 
450
  # Initialize boto3 clients
 
484
 
485
  return job_df
486
 
 
487
  def download_textract_output(job_id:str,
488
  output_bucket:str,
489
  output_prefix:str,
 
523
  s3_client.download_file(output_bucket, output_file_key, local_file_path)
524
  print(f"Output file downloaded to: {local_file_path}")
525
  except Exception as e:
526
+ print(f"Error downloading file: {e}")