Commit 826ed50
1 Parent(s): 0e9dd2d
Added further file limits to deduplication and file load functions
Files changed:
- cli_redact.py (+2 -4)
- tools/file_conversion.py (+6 -1)
- tools/find_duplicate_pages.py (+25 -7)
- tools/find_duplicate_tabular.py (+18 -2)
cli_redact.py CHANGED

@@ -57,8 +57,6 @@ def _get_env_list(env_var_name: str) -> list[str]:
     # Split by comma and filter out any empty strings that might result from extra commas
     return [s.strip() for s in value.split(',') if s.strip()]
 
-
-
 # --- Constants and Configuration ---
 
 if CHOSEN_COMPREHEND_ENTITIES: CHOSEN_COMPREHEND_ENTITIES = _get_env_list(CHOSEN_COMPREHEND_ENTITIES)
@@ -133,8 +131,8 @@ python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_profe
 ## Redact specific pages with AWS OCR and signature extraction:
 python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --page_min 6 --page_max 7 --ocr_method "AWS Textract" --handwrite_signature_extraction "Extract handwriting" "Extract signatures"
 
-## Redact with AWS OCR and additional extraction options:
-python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --ocr_method "AWS Textract" --
+## Redact with AWS OCR and additional layout extraction options:
+python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --ocr_method "AWS Textract" --extract_layout
 
 # Duplicate page detection
 
tools/file_conversion.py CHANGED

@@ -23,7 +23,7 @@ import random
 import string
 import warnings # To warn about potential type changes
 
-from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR, COMPRESS_REDACTED_PDF, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION
+from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR, COMPRESS_REDACTED_PDF, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, MAX_SIMULTANEOUS_FILES
 from tools.helper_functions import get_file_name_without_type, read_file
 # from tools.aws_textract import load_and_convert_textract_json
 
@@ -568,6 +568,11 @@ def prepare_image_or_pdf(
 
     if isinstance(file_paths, str): file_path_number = 1
     else: file_path_number = len(file_paths)
+
+    if file_path_number > MAX_SIMULTANEOUS_FILES:
+        out_message = f"Number of files loaded is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
+        print(out_message)
+        raise Exception(out_message)
 
     latest_file_completed = int(latest_file_completed)
 
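For orientation, here is a minimal, self-contained sketch of the guard this hunk adds to prepare_image_or_pdf. The limit value below is a placeholder; the real MAX_SIMULTANEOUS_FILES is read from tools.config and its default is not shown in this commit.

# Sketch of the new file-count guard (assumption: MAX_SIMULTANEOUS_FILES is an int
# from tools.config; 10 is a placeholder used only for this example).
MAX_SIMULTANEOUS_FILES = 10

def check_file_count(file_paths):
    """Raise if more files are submitted than the configured limit allows."""
    file_path_number = 1 if isinstance(file_paths, str) else len(file_paths)
    if file_path_number > MAX_SIMULTANEOUS_FILES:
        out_message = (f"Number of files loaded is greater than {MAX_SIMULTANEOUS_FILES}. "
                       "Please submit a smaller number of files.")
        print(out_message)
        raise Exception(out_message)
    return file_path_number

# Three paths pass the check; a list longer than the limit would raise.
check_file_count(["a.pdf", "b.pdf", "c.pdf"])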
tools/find_duplicate_pages.py CHANGED

@@ -11,6 +11,7 @@ from gradio import Progress
 from pathlib import Path
 from typing import List
 from tools.helper_functions import OUTPUT_FOLDER
+from tools.config import MAX_SIMULTANEOUS_FILES
 from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
 from tools.load_spacy_model_custom_recognisers import nlp
 
@@ -603,8 +604,6 @@ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str,
 
     return output_paths
 
-
-
 def _sequences_match(query_seq: List[str], ref_seq: List[str]) -> bool:
     """
     Helper function to compare two sequences of tokens with punctuation flexibility.
@@ -640,7 +639,6 @@ def _sequences_match(query_seq: List[str], ref_seq: List[str]) -> bool:
     # If the loop completes, every token has matched.
     return True
 
-
 def find_consecutive_sequence_matches(
     df_filtered: pd.DataFrame,
     search_file_name: str,
@@ -895,12 +893,32 @@ def exclude_match(results_df:pd.DataFrame, selected_index:int, output_folder=OUT
     # Return the updated dataframe, the new file list, and clear the preview panes
     return updated_df, new_output_paths, None, None
 
-def run_duplicate_analysis(files:list[
+def run_duplicate_analysis(files:list[str], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, combine_pages:bool=True, preview_length:int=500, output_folder:str=OUTPUT_FOLDER, progress=gr.Progress(track_tqdm=True)):
     """
-
+    Main wrapper function to orchestrate the duplicate page analysis process.
+    It handles file loading, text combination, similarity identification,
+    and result saving.
+
+    Args:
+        files (list[str]): A list of file paths (PDFs, etc.) to be analyzed for duplicate content.
+        threshold (float): The similarity threshold (0.0 to 1.0) above which text segments are considered duplicates.
+        min_words (int): The minimum number of words a text segment must contain to be included in the analysis.
+        min_consecutive (int): The minimum number of consecutive pages that must match for a sequence to be considered a duplicate.
+        greedy_match (bool): If True, uses a greedy matching strategy for identifying consecutive sequences.
+        combine_pages (bool, optional): If True, text from multiple pages is combined into larger segments for analysis. Defaults to True.
+        preview_length (int, optional): The maximum number of characters to display in the text preview panes. Defaults to 500.
+        output_folder (str, optional): The directory where the similarity results and redaction lists will be saved. Defaults to OUTPUT_FOLDER.
+        progress (gr.Progress, optional): A Gradio progress tracker object to display progress in the UI.
     """
-
-
+
+    if not files: raise Warning("Please upload files to analyse.")
+
+    if isinstance(files, str): files = [files]
+
+    if len(files) > MAX_SIMULTANEOUS_FILES:
+        out_message = f"Number of files to deduplicate is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
+        print(out_message)
+        raise Exception(out_message)
 
     start_time = time.time()
 
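As a usage note, the signature added above suggests a call along the following lines. The file names and parameter values are illustrative only, and the shape of the return value is not shown in this diff, so it is left as a single unnamed variable here.

# Hypothetical invocation of the updated run_duplicate_analysis (illustrative values only).
from tools.find_duplicate_pages import run_duplicate_analysis

outputs = run_duplicate_analysis(
    files=["report_v1.pdf", "report_v2.pdf"],  # placeholder paths
    threshold=0.9,          # similarity threshold between 0.0 and 1.0
    min_words=10,           # ignore very short text segments
    min_consecutive=3,      # require at least 3 matching consecutive pages
    greedy_match=True,      # use the greedy sequence-matching strategy
    combine_pages=True,
)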
tools/find_duplicate_tabular.py CHANGED

@@ -11,7 +11,7 @@ from pathlib import Path
 from tools.helper_functions import OUTPUT_FOLDER, read_file
 from tools.data_anonymise import initial_clean
 from tools.load_spacy_model_custom_recognisers import nlp
-from tools.config import DO_INITIAL_TABULAR_DATA_CLEAN, REMOVE_DUPLICATE_ROWS
+from tools.config import DO_INITIAL_TABULAR_DATA_CLEAN, REMOVE_DUPLICATE_ROWS, MAX_SIMULTANEOUS_FILES, MAX_TABULAR_ROWS
 
 if REMOVE_DUPLICATE_ROWS == "True": REMOVE_DUPLICATE_ROWS = True
 else: REMOVE_DUPLICATE_ROWS = False
@@ -139,6 +139,12 @@ def find_duplicate_cells_in_tabular_data(
 
     # If sheet was successfully_loaded
     if not temp_df.empty:
+
+        if temp_df.shape[0] > MAX_TABULAR_ROWS:
+            out_message = f"Number of rows in {file_path} for sheet {sheet_name} is greater than {MAX_TABULAR_ROWS}. Please submit a smaller file."
+            print(out_message)
+            raise Exception(out_message)
+
         file_name = os.path.basename(file_path) + "_" + sheet_name
         file_paths.append(file_path)
 
@@ -154,6 +160,11 @@ def find_duplicate_cells_in_tabular_data(
         temp_df = pd.DataFrame()
     else:
         temp_df = read_file(file_path)
+
+        if temp_df.shape[0] > MAX_TABULAR_ROWS:
+            out_message = f"Number of rows in {file_path} is greater than {MAX_TABULAR_ROWS}. Please submit a smaller file."
+            print(out_message)
+            raise Exception(out_message)
 
     file_name = os.path.basename(file_path)
     file_paths.append(file_path)
@@ -528,7 +539,7 @@ def run_tabular_duplicate_detection(files, threshold, min_words, text_columns, o
     # If output folder doesn't end with a forward slash, add one
     if not output_folder.endswith('/'): output_folder = output_folder + '/'
 
-    file_paths =
+    file_paths = list()
     if isinstance(files, str):
         # If 'files' is a single string, treat it as a list with one element
         file_paths.append(files)
@@ -551,6 +562,11 @@ def run_tabular_duplicate_detection(files, threshold, min_words, text_columns, o
         # Raise an error for any other unexpected type of the 'files' argument itself
         raise TypeError(f"Unexpected type for 'files' argument: {type(files)}. Expected str, list of str/file objects, or a single file object.")
 
+    if len(file_paths) > MAX_SIMULTANEOUS_FILES:
+        out_message = f"Number of files to deduplicate is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
+        print(out_message)
+        raise Exception(out_message)
+
     results_df, output_paths, full_data = run_tabular_duplicate_analysis(
         files=file_paths,
        threshold=threshold,
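Finally, a small self-contained sketch of the row-limit check added to find_duplicate_cells_in_tabular_data, assuming MAX_TABULAR_ROWS is an integer from tools.config (the value below is a placeholder only).

import pandas as pd

MAX_TABULAR_ROWS = 250_000  # placeholder; the real limit lives in tools.config

def check_row_limit(temp_df: pd.DataFrame, file_path: str) -> None:
    """Mirror of the guard added above: reject tables with more rows than allowed."""
    if temp_df.shape[0] > MAX_TABULAR_ROWS:
        out_message = (f"Number of rows in {file_path} is greater than {MAX_TABULAR_ROWS}. "
                       "Please submit a smaller file.")
        print(out_message)
        raise Exception(out_message)

# A three-row frame passes; an oversized one would raise before any deduplication runs.
check_row_limit(pd.DataFrame({"text": ["a", "b", "c"]}), "example.csv")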