seanpedrickcase committed
Commit 826ed50 · 1 Parent(s): 0e9dd2d

Added further file limits to deduplication and file load functions

cli_redact.py CHANGED
@@ -57,8 +57,6 @@ def _get_env_list(env_var_name: str) -> list[str]:
     # Split by comma and filter out any empty strings that might result from extra commas
     return [s.strip() for s in value.split(',') if s.strip()]
 
-
-
 # --- Constants and Configuration ---
 
 if CHOSEN_COMPREHEND_ENTITIES: CHOSEN_COMPREHEND_ENTITIES = _get_env_list(CHOSEN_COMPREHEND_ENTITIES)
@@ -133,8 +131,8 @@ python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_profe
 ## Redact specific pages with AWS OCR and signature extraction:
 python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --page_min 6 --page_max 7 --ocr_method "AWS Textract" --handwrite_signature_extraction "Extract handwriting" "Extract signatures"
 
-## Redact with AWS OCR and additional extraction options:
-python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --ocr_method "AWS Textract" --extract_forms --extract_tables --extract_layout
+## Redact with AWS OCR and additional layout extraction options:
+python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --ocr_method "AWS Textract" --extract_layout
 
 # Duplicate page detection
 
tools/file_conversion.py CHANGED
@@ -23,7 +23,7 @@ import random
 import string
 import warnings # To warn about potential type changes
 
-from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR, COMPRESS_REDACTED_PDF, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION
+from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR, COMPRESS_REDACTED_PDF, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, MAX_SIMULTANEOUS_FILES
 from tools.helper_functions import get_file_name_without_type, read_file
 # from tools.aws_textract import load_and_convert_textract_json
 
@@ -568,6 +568,11 @@ def prepare_image_or_pdf(
 
     if isinstance(file_paths, str): file_path_number = 1
     else: file_path_number = len(file_paths)
+
+    if file_path_number > MAX_SIMULTANEOUS_FILES:
+        out_message = f"Number of files loaded is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
+        print(out_message)
+        raise Exception(out_message)
 
     latest_file_completed = int(latest_file_completed)
 
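The new limit comes from tools.config, which is not shown in this commit. Below is a minimal sketch of how MAX_SIMULTANEOUS_FILES (and the MAX_TABULAR_ROWS constant imported further down) could plausibly be declared there; the environment-variable mechanism and the default values are assumptions, not taken from this diff.

# Hypothetical excerpt of tools/config.py -- the real module may define these differently.
import os

# Assumed pattern: limits are read from environment variables with fallback defaults.
MAX_SIMULTANEOUS_FILES = int(os.environ.get("MAX_SIMULTANEOUS_FILES", "10"))
MAX_TABULAR_ROWS = int(os.environ.get("MAX_TABULAR_ROWS", "250000"))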
tools/find_duplicate_pages.py CHANGED
@@ -11,6 +11,7 @@ from gradio import Progress
 from pathlib import Path
 from typing import List
 from tools.helper_functions import OUTPUT_FOLDER
+from tools.config import MAX_SIMULTANEOUS_FILES
 from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
 from tools.load_spacy_model_custom_recognisers import nlp
 
@@ -603,8 +604,6 @@ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str,
 
     return output_paths
 
-
-
 def _sequences_match(query_seq: List[str], ref_seq: List[str]) -> bool:
     """
     Helper function to compare two sequences of tokens with punctuation flexibility.
@@ -640,7 +639,6 @@ def _sequences_match(query_seq: List[str], ref_seq: List[str]) -> bool:
     # If the loop completes, every token has matched.
     return True
 
-
 def find_consecutive_sequence_matches(
     df_filtered: pd.DataFrame,
     search_file_name: str,
@@ -895,12 +893,32 @@ def exclude_match(results_df:pd.DataFrame, selected_index:int, output_folder=OUT
     # Return the updated dataframe, the new file list, and clear the preview panes
     return updated_df, new_output_paths, None, None
 
-def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, combine_pages:bool=True, preview_length:int=500, output_folder:str=OUTPUT_FOLDER, progress=gr.Progress(track_tqdm=True)):
+def run_duplicate_analysis(files:list[str], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, combine_pages:bool=True, preview_length:int=500, output_folder:str=OUTPUT_FOLDER, progress=gr.Progress(track_tqdm=True)):
     """
-    Wrapper function updated to include the 'greedy_match' boolean.
+    Main wrapper function to orchestrate the duplicate page analysis process.
+    It handles file loading, text combination, similarity identification,
+    and result saving.
+
+    Args:
+        files (list[str]): A list of file paths (PDFs, etc.) to be analyzed for duplicate content.
+        threshold (float): The similarity threshold (0.0 to 1.0) above which text segments are considered duplicates.
+        min_words (int): The minimum number of words a text segment must contain to be included in the analysis.
+        min_consecutive (int): The minimum number of consecutive pages that must match for a sequence to be considered a duplicate.
+        greedy_match (bool): If True, uses a greedy matching strategy for identifying consecutive sequences.
+        combine_pages (bool, optional): If True, text from multiple pages is combined into larger segments for analysis. Defaults to True.
+        preview_length (int, optional): The maximum number of characters to display in the text preview panes. Defaults to 500.
+        output_folder (str, optional): The directory where the similarity results and redaction lists will be saved. Defaults to OUTPUT_FOLDER.
+        progress (gr.Progress, optional): A Gradio progress tracker object to display progress in the UI.
     """
-    if not files:
-        raise Warning("Please upload files to analyse.")
+
+    if not files: raise Warning("Please upload files to analyse.")
+
+    if isinstance(files, str): files = [files]
+
+    if len(files) > MAX_SIMULTANEOUS_FILES:
+        out_message = f"Number of files to deduplicate is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
+        print(out_message)
+        raise Exception(out_message)
 
     start_time = time.time()
 
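Given the signature change above (file paths instead of DataFrames), here is a minimal sketch of a direct call; the parameter values are illustrative only, and the function's return value is not shown in this diff, so it is not unpacked here.

# Hypothetical call with the updated list[str] signature; values are examples only.
run_duplicate_analysis(
    files=["example_data/Partnership-Agreement-Toolkit_0_0.pdf"],  # file paths, not DataFrames
    threshold=0.9,        # similarity score above which segments are treated as duplicates
    min_words=10,         # ignore very short text segments
    min_consecutive=3,    # require at least 3 matching consecutive pages
    greedy_match=True,    # use the greedy strategy for consecutive sequences
)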
tools/find_duplicate_tabular.py CHANGED
@@ -11,7 +11,7 @@ from pathlib import Path
 from tools.helper_functions import OUTPUT_FOLDER, read_file
 from tools.data_anonymise import initial_clean
 from tools.load_spacy_model_custom_recognisers import nlp
-from tools.config import DO_INITIAL_TABULAR_DATA_CLEAN, REMOVE_DUPLICATE_ROWS
+from tools.config import DO_INITIAL_TABULAR_DATA_CLEAN, REMOVE_DUPLICATE_ROWS, MAX_SIMULTANEOUS_FILES, MAX_TABULAR_ROWS
 
 if REMOVE_DUPLICATE_ROWS == "True": REMOVE_DUPLICATE_ROWS = True
 else: REMOVE_DUPLICATE_ROWS = False
@@ -139,6 +139,12 @@ def find_duplicate_cells_in_tabular_data(
 
             # If sheet was successfully_loaded
             if not temp_df.empty:
+
+                if temp_df.shape[0] > MAX_TABULAR_ROWS:
+                    out_message = f"Number of rows in {file_path} for sheet {sheet_name} is greater than {MAX_TABULAR_ROWS}. Please submit a smaller file."
+                    print(out_message)
+                    raise Exception(out_message)
+
                 file_name = os.path.basename(file_path) + "_" + sheet_name
                 file_paths.append(file_path)
 
@@ -154,6 +160,11 @@ def find_duplicate_cells_in_tabular_data(
                 temp_df = pd.DataFrame()
         else:
             temp_df = read_file(file_path)
+
+            if temp_df.shape[0] > MAX_TABULAR_ROWS:
+                out_message = f"Number of rows in {file_path} is greater than {MAX_TABULAR_ROWS}. Please submit a smaller file."
+                print(out_message)
+                raise Exception(out_message)
 
             file_name = os.path.basename(file_path)
             file_paths.append(file_path)
@@ -528,7 +539,7 @@ def run_tabular_duplicate_detection(files, threshold, min_words, text_columns, o
     # If output folder doesn't end with a forward slash, add one
     if not output_folder.endswith('/'): output_folder = output_folder + '/'
 
-    file_paths = []
+    file_paths = list()
     if isinstance(files, str):
         # If 'files' is a single string, treat it as a list with one element
        file_paths.append(files)
@@ -551,6 +562,11 @@ def run_tabular_duplicate_detection(files, threshold, min_words, text_columns, o
        # Raise an error for any other unexpected type of the 'files' argument itself
        raise TypeError(f"Unexpected type for 'files' argument: {type(files)}. Expected str, list of str/file objects, or a single file object.")
 
+    if len(file_paths) > MAX_SIMULTANEOUS_FILES:
+        out_message = f"Number of files to deduplicate is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
+        print(out_message)
+        raise Exception(out_message)
+
     results_df, output_paths, full_data = run_tabular_duplicate_analysis(
         files=file_paths,
         threshold=threshold,
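
The same guard pattern (compare a count against a limit, print a message, raise an Exception) is repeated in each of the files above. Not part of this commit, but a possible consolidation would be a small shared helper along these lines; the function name and its location are hypothetical.

# Hypothetical helper (e.g. in tools/helper_functions.py) -- not part of this commit.
def enforce_limit(count: int, limit: int, description: str) -> None:
    """Raise if count exceeds limit, mirroring the checks added in this commit."""
    if count > limit:
        out_message = f"{description} is greater than {limit}. Please submit a smaller amount."
        print(out_message)
        raise Exception(out_message)

# Example use inside prepare_image_or_pdf:
# enforce_limit(file_path_number, MAX_SIMULTANEOUS_FILES, "Number of files loaded")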