Sean Pedrick-Case committed on
Commit c27db98 · unverified · 2 Parent(s): 4a5cee5 d370b1c

Merge pull request #15 from seanpedrick-case/dev

Added gradio_image_annotation functionality, upgraded gradio, example logs file

app.py CHANGED
@@ -670,7 +670,9 @@ with app:
      app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
      else: print("Could not load in cost code data")
 
-     ### LOGGING
+     ###
+     # LOGGING
+     ###
 
      # Log usernames and times of access to file (to know who is using the app when running on AWS)
      access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
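
Note: CSVLogger_custom is the app's own subclass of Gradio's CSVLogger flagging callback. As a minimal sketch of the same access-logging idea with a plain function (names here are illustrative, not the app's actual code):

    import csv
    from datetime import datetime

    # Hypothetical stand-in for CSVLogger_custom: append one row per page load.
    def log_access(username: str, log_file_name: str = "access_log.csv") -> None:
        with open(log_file_name, "a", newline="") as f:
            csv.writer(f).writerow([username, datetime.now().isoformat()])

In a Gradio Blocks app this kind of function can be wired up via app.load(...), which fires once each time a user opens the page.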
load_s3_logs.py ADDED
@@ -0,0 +1,66 @@
+ import boto3
+ import pandas as pd
+ from io import StringIO
+ from datetime import datetime
+ from tools.config import DOCUMENT_REDACTION_BUCKET
+
+ # S3 setup
+ s3 = boto3.client('s3')
+ bucket_name = DOCUMENT_REDACTION_BUCKET
+ prefix = 'logs' # 'usage/' # 'feedback/' # Change as needed - top-level folder where logs are stored
+ earliest_date = '20250401' # Earliest date of logs folder retrieved
+ latest_date = '20250412' # Latest date of logs folder retrieved
+
+ # Function to list all files in a folder
+ def list_files_in_s3(bucket, prefix):
+     response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
+     if 'Contents' in response:
+         return [content['Key'] for content in response['Contents']]
+     return []
+
+ # Function to check whether a date string falls within the date range
+ def is_within_date_range(date_str, start_date, end_date):
+     date_obj = datetime.strptime(date_str, '%Y%m%d')
+     return start_date <= date_obj <= end_date
+
+ # Define the date range from the strings above
+ start_date = datetime.strptime(earliest_date, '%Y%m%d')
+ end_date = datetime.strptime(latest_date, '%Y%m%d')
+
+ # List all files under the chosen prefix
+ all_files = list_files_in_s3(bucket_name, prefix)
+
+ # Keep only log.csv files whose folder date falls within the date range
+ log_files = []
+ for file in all_files:
+     parts = file.split('/')
+     if len(parts) >= 3:
+         date_str = parts[1]
+         if is_within_date_range(date_str, start_date, end_date) and parts[-1] == 'log.csv':
+             log_files.append(file)
+
+ # Download, read and concatenate CSV files into a pandas DataFrame
+ df_list = []
+ for log_file in log_files:
+     # Download the file
+     obj = s3.get_object(Bucket=bucket_name, Key=log_file)
+     csv_content = obj['Body'].read().decode('utf-8')
+
+     # Read CSV content into a pandas DataFrame
+     try:
+         df = pd.read_csv(StringIO(csv_content))
+     except Exception as e:
+         print("Could not load in log file:", log_file, "due to:", e)
+         continue
+
+     df_list.append(df)
+
+ # Concatenate all DataFrames
+ if df_list:
+     concatenated_df = pd.concat(df_list, ignore_index=True)
+
+     # Save the concatenated DataFrame to a CSV file
+     concatenated_df.to_csv('consolidated_logs.csv', index=False)
+     print("Consolidated CSV saved as 'consolidated_logs.csv'")
+ else:
+     print("No log files found in the given date range.")
requirements.txt CHANGED
@@ -11,17 +11,15 @@ scikit-learn==1.6.1
  spacy==3.8.4
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
  #en_core_web_sm @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
- #gradio==5.23.3 # Using latest version of Gradio 5.25.0 below as it fixes the table select issues while filtered
- https://gradio-pypi-previews.s3.amazonaws.com/3e66dcbc9f3b1d106f5488fb1dca51f0787e6d79/gradio-5.25.0-py3-none-any.whl
+ gradio==5.25.2
  boto3==1.37.29
  pyarrow==19.0.1
  openpyxl==3.1.5
  Faker==36.1.1
  python-levenshtein==0.26.1
  spaczz==0.6.1
- #gradio_image_annotation==0.2.5
- # The following version includes rotation and image zoom options
- https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.0/gradio_image_annotation-0.3.0-py3-none-any.whl
+ # The following version includes rotation, image zoom, and default labels, as well as the option to include an id for annotation boxes
+ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.1/gradio_image_annotation-0.3.1-py3-none-any.whl
  rapidfuzz==3.12.1
  python-dotenv==1.0.1
  numpy==1.26.4
tools/file_conversion.py CHANGED
@@ -1036,7 +1036,7 @@ def do_proximity_match_all_pages_for_text(df1:pd.DataFrame, df2:pd.DataFrame, th
      # Check for NaN or infinite values in query_coords and filter them out
      finite_mask = np.isfinite(query_coords).all(axis=1)
      if not finite_mask.all():
-         print("Warning: query_coords contains non-finite values. Filtering out non-finite entries.")
+         #print("Warning: query_coords contains non-finite values. Filtering out non-finite entries.")
          query_coords = query_coords[finite_mask] # Filter out rows with NaN or infinite values
      else:
          pass
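
The surrounding logic drops rows with NaN or infinite coordinates before the proximity match; this change only silences the warning. A self-contained illustration of the np.isfinite row mask used here (values are illustrative):

    import numpy as np

    query_coords = np.array([[1.0, 2.0], [np.nan, 3.0], [4.0, np.inf]])
    finite_mask = np.isfinite(query_coords).all(axis=1)  # True only where every column is finite
    print(query_coords[finite_mask])  # [[1. 2.]] - rows containing NaN/inf are dropped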
tools/redaction_review.py CHANGED
@@ -351,7 +351,8 @@ def update_annotator_object_and_filter_df(
          show_share_button=False,
          show_remove_button=False,
          handles_cursor=True,
-         interactive=True
+         interactive=True,
+         use_default_label=True
      )
 
      return out_image_annotator, page_number_reported_gradio, page_number_reported_gradio, page_num_reported, recogniser_entities_dropdown_value, recogniser_dataframe_out_gr, recogniser_dataframe_modified, text_entities_drop, page_entities_drop, page_sizes, all_image_annotations
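
The new use_default_label flag comes from the forked gradio_image_annotator component pinned in requirements.txt; when enabled, newly drawn boxes take a default label instead of prompting for one. A hypothetical minimal construction, assuming the package exposes the image_annotator component as the upstream project does:

    from gradio_image_annotation import image_annotator  # forked component pinned in requirements.txt

    annotator = image_annotator(
        label="Review redactions",
        show_share_button=False,
        show_remove_button=False,
        handles_cursor=True,
        interactive=True,
        use_default_label=True,  # assumed v0.3.x behaviour: new boxes get a default label
    )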
tools/textract_batch_call.py CHANGED
@@ -10,15 +10,8 @@ from io import StringIO
  from urllib.parse import urlparse
  from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
 
- # MY_LOCAL_PDF = r"C:\path\to\your\document.pdf" # Use raw string for Windows paths
- # MY_S3_BUCKET = TEXTRACT_BULK_ANALYSIS_BUCKET # MUST BE UNIQUE GLOBALLY
- # MY_S3_INPUT_PREFIX = session_hash_textbox # Folder in the bucket for uploads
- # MY_S3_OUTPUT_PREFIX = session_hash_textbox # Folder in the bucket for results
- # MY_LOCAL_OUTPUT_DIR = OUTPUT_FOLDER # Local folder to save JSON
- # MY_AWS_REGION = AWS_REGION # e.g., 'us-east-1', 'eu-west-1'
  from tools.config import TEXTRACT_BULK_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
- from tools.aws_textract import json_to_ocrresult
-
+ #from tools.aws_textract import json_to_ocrresult
 
  def analyse_document_with_textract_api(
      local_pdf_path: str,
@@ -202,9 +195,13 @@ def analyse_document_with_textract_api(
  def return_job_status(job_id:str,
                        response:dict,
                        attempts:int,
-                       poll_interval_seconds: int = 5,
+                       poll_interval_seconds: int = 0,
                        max_polling_attempts: int = 1 # ~10 minutes total wait time
                        ):
+     '''
+     Poll Textract for the current status of a previously-submitted job.
+     '''
+
      job_status = response['JobStatus']
      logging.info(f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}")
 
@@ -232,7 +229,11 @@ def download_textract_job_files(s3_client:str,
                                  s3_output_key_prefix:str,
                                  pdf_filename:str,
                                  job_id:str,
-                                 local_output_dir:str):
+                                 local_output_dir:str):
+     '''
+     Download and combine selected job files from the AWS Textract service.
+     '''
+
      list_response = s3_client.list_objects_v2(
          Bucket=s3_bucket_name,
          Prefix=s3_output_key_prefix
@@ -329,9 +330,13 @@ def poll_bulk_textract_analysis_progress_and_download(
      load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
      load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
      aws_region: str = AWS_REGION, # Optional: specify region if not default
+     load_jobs_from_s3:str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
      poll_interval_seconds: int = 1,
      max_polling_attempts: int = 1 # ~10 minutes total wait time
      ):
+     '''
+     Poll AWS for the status of a Textract API job. Return the status and, if finished, combine and download the results into a locally-stored JSON file for further processing by the app.
+     '''
 
      if job_id:
          # Initialize boto3 clients
@@ -349,7 +354,7 @@ def poll_bulk_textract_analysis_progress_and_download(
 
          # Update Textract document history df
          try:
-             job_df = load_in_textract_job_details(load_s3_jobs=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
+             job_df = load_in_textract_job_details(load_s3_jobs=load_jobs_from_s3,
                                                    load_s3_jobs_loc=load_s3_jobs_loc,
                                                    load_local_jobs_loc=load_local_jobs_loc)
          except Exception as e:
@@ -431,14 +436,15 @@ def poll_bulk_textract_analysis_progress_and_download(
 
      return downloaded_file_path, job_status, job_df
 
-
-
  def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
                                   load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
                                   load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
                                   document_redaction_bucket:str=DOCUMENT_REDACTION_BUCKET,
                                   aws_region:str=AWS_REGION):
-
+     '''
+     Load in a dataframe of jobs previously submitted to the Textract API service.
+     '''
+
      job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time'])
 
      # Initialize boto3 clients
@@ -478,7 +484,6 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
 
      return job_df
 
-
  def download_textract_output(job_id:str,
                               output_bucket:str,
                               output_prefix:str,
@@ -518,4 +523,4 @@ def download_textract_output(job_id:str,
          s3_client.download_file(output_bucket, output_file_key, local_file_path)
          print(f"Output file downloaded to: {local_file_path}")
      except Exception as e:
-         print(f"Error downloading file: {e}")
+         print(f"Error downloading file: {e}")