Commit 826ed50
1 Parent(s): 0e9dd2d
Added further file limits to deduplication and file load functions
Files changed:
- cli_redact.py (+2 -4)
- tools/file_conversion.py (+6 -1)
- tools/find_duplicate_pages.py (+25 -7)
- tools/find_duplicate_tabular.py (+18 -2)
cli_redact.py CHANGED

@@ -57,8 +57,6 @@ def _get_env_list(env_var_name: str) -> list[str]:
     # Split by comma and filter out any empty strings that might result from extra commas
     return [s.strip() for s in value.split(',') if s.strip()]
 
-
-
 # --- Constants and Configuration ---
 
 if CHOSEN_COMPREHEND_ENTITIES: CHOSEN_COMPREHEND_ENTITIES = _get_env_list(CHOSEN_COMPREHEND_ENTITIES)
@@ -133,8 +131,8 @@ python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_profe
 ## Redact specific pages with AWS OCR and signature extraction:
 python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --page_min 6 --page_max 7 --ocr_method "AWS Textract" --handwrite_signature_extraction "Extract handwriting" "Extract signatures"
 
-## Redact with AWS OCR and additional extraction options:
-python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --ocr_method "AWS Textract" --
+## Redact with AWS OCR and additional layout extraction options:
+python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --ocr_method "AWS Textract" --extract_layout
 
 # Duplicate page detection
 
tools/file_conversion.py CHANGED

@@ -23,7 +23,7 @@ import random
 import string
 import warnings # To warn about potential type changes
 
-from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR, COMPRESS_REDACTED_PDF, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION
+from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR, COMPRESS_REDACTED_PDF, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, MAX_SIMULTANEOUS_FILES
 from tools.helper_functions import get_file_name_without_type, read_file
 # from tools.aws_textract import load_and_convert_textract_json
 
@@ -568,6 +568,11 @@ def prepare_image_or_pdf(
 
     if isinstance(file_paths, str): file_path_number = 1
     else: file_path_number = len(file_paths)
+
+    if file_path_number > MAX_SIMULTANEOUS_FILES:
+        out_message = f"Number of files loaded is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
+        print(out_message)
+        raise Exception(out_message)
 
     latest_file_completed = int(latest_file_completed)
 
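For orientation, here is a minimal, self-contained sketch of the guard this hunk adds to prepare_image_or_pdf. The limit value below is a placeholder; the real MAX_SIMULTANEOUS_FILES is read from tools.config and its default is not shown in this commit.

# Sketch of the new file-count guard (assumption: MAX_SIMULTANEOUS_FILES is an int
# from tools.config; 10 is a placeholder used only for this example).
MAX_SIMULTANEOUS_FILES = 10

def check_file_count(file_paths):
    """Raise if more files are submitted than the configured limit allows."""
    file_path_number = 1 if isinstance(file_paths, str) else len(file_paths)
    if file_path_number > MAX_SIMULTANEOUS_FILES:
        out_message = (f"Number of files loaded is greater than {MAX_SIMULTANEOUS_FILES}. "
                       "Please submit a smaller number of files.")
        print(out_message)
        raise Exception(out_message)
    return file_path_number

# Three paths pass the check; a list longer than the limit would raise.
check_file_count(["a.pdf", "b.pdf", "c.pdf"])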
tools/find_duplicate_pages.py CHANGED

@@ -11,6 +11,7 @@ from gradio import Progress
 from pathlib import Path
 from typing import List
 from tools.helper_functions import OUTPUT_FOLDER
+from tools.config import MAX_SIMULTANEOUS_FILES
 from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
 from tools.load_spacy_model_custom_recognisers import nlp
 
@@ -603,8 +604,6 @@ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str,
 
     return output_paths
 
-
-
 def _sequences_match(query_seq: List[str], ref_seq: List[str]) -> bool:
     """
     Helper function to compare two sequences of tokens with punctuation flexibility.
@@ -640,7 +639,6 @@ def _sequences_match(query_seq: List[str], ref_seq: List[str]) -> bool:
     # If the loop completes, every token has matched.
     return True
 
-
 def find_consecutive_sequence_matches(
     df_filtered: pd.DataFrame,
     search_file_name: str,
@@ -895,12 +893,32 @@ def exclude_match(results_df:pd.DataFrame, selected_index:int, output_folder=OUT
     # Return the updated dataframe, the new file list, and clear the preview panes
     return updated_df, new_output_paths, None, None
 
-def run_duplicate_analysis(files:list[
+def run_duplicate_analysis(files:list[str], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, combine_pages:bool=True, preview_length:int=500, output_folder:str=OUTPUT_FOLDER, progress=gr.Progress(track_tqdm=True)):
     """
-
+    Main wrapper function to orchestrate the duplicate page analysis process.
+    It handles file loading, text combination, similarity identification,
+    and result saving.
+
+    Args:
+        files (list[str]): A list of file paths (PDFs, etc.) to be analyzed for duplicate content.
+        threshold (float): The similarity threshold (0.0 to 1.0) above which text segments are considered duplicates.
+        min_words (int): The minimum number of words a text segment must contain to be included in the analysis.
+        min_consecutive (int): The minimum number of consecutive pages that must match for a sequence to be considered a duplicate.
+        greedy_match (bool): If True, uses a greedy matching strategy for identifying consecutive sequences.
+        combine_pages (bool, optional): If True, text from multiple pages is combined into larger segments for analysis. Defaults to True.
+        preview_length (int, optional): The maximum number of characters to display in the text preview panes. Defaults to 500.
+        output_folder (str, optional): The directory where the similarity results and redaction lists will be saved. Defaults to OUTPUT_FOLDER.
+        progress (gr.Progress, optional): A Gradio progress tracker object to display progress in the UI.
     """
-
-
+
+    if not files: raise Warning("Please upload files to analyse.")
+
+    if isinstance(files, str): files = [files]
+
+    if len(files) > MAX_SIMULTANEOUS_FILES:
+        out_message = f"Number of files to deduplicate is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
+        print(out_message)
+        raise Exception(out_message)
 
     start_time = time.time()
 
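As a usage note, the signature added above suggests a call along the following lines. The file names and parameter values are illustrative only, and the shape of the return value is not shown in this diff, so it is left as a single unnamed variable here.

# Hypothetical invocation of the updated run_duplicate_analysis (illustrative values only).
from tools.find_duplicate_pages import run_duplicate_analysis

outputs = run_duplicate_analysis(
    files=["report_v1.pdf", "report_v2.pdf"],  # placeholder paths
    threshold=0.9,          # similarity threshold between 0.0 and 1.0
    min_words=10,           # ignore very short text segments
    min_consecutive=3,      # require at least 3 matching consecutive pages
    greedy_match=True,      # use the greedy sequence-matching strategy
    combine_pages=True,
)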
tools/find_duplicate_tabular.py CHANGED

@@ -11,7 +11,7 @@ from pathlib import Path
 from tools.helper_functions import OUTPUT_FOLDER, read_file
 from tools.data_anonymise import initial_clean
 from tools.load_spacy_model_custom_recognisers import nlp
-from tools.config import DO_INITIAL_TABULAR_DATA_CLEAN, REMOVE_DUPLICATE_ROWS
+from tools.config import DO_INITIAL_TABULAR_DATA_CLEAN, REMOVE_DUPLICATE_ROWS, MAX_SIMULTANEOUS_FILES, MAX_TABULAR_ROWS
 
 if REMOVE_DUPLICATE_ROWS == "True": REMOVE_DUPLICATE_ROWS = True
 else: REMOVE_DUPLICATE_ROWS = False
@@ -139,6 +139,12 @@ def find_duplicate_cells_in_tabular_data(
 
     # If sheet was successfully_loaded
     if not temp_df.empty:
+
+        if temp_df.shape[0] > MAX_TABULAR_ROWS:
+            out_message = f"Number of rows in {file_path} for sheet {sheet_name} is greater than {MAX_TABULAR_ROWS}. Please submit a smaller file."
+            print(out_message)
+            raise Exception(out_message)
+
         file_name = os.path.basename(file_path) + "_" + sheet_name
         file_paths.append(file_path)
 
@@ -154,6 +160,11 @@ def find_duplicate_cells_in_tabular_data(
         temp_df = pd.DataFrame()
     else:
         temp_df = read_file(file_path)
+
+        if temp_df.shape[0] > MAX_TABULAR_ROWS:
+            out_message = f"Number of rows in {file_path} is greater than {MAX_TABULAR_ROWS}. Please submit a smaller file."
+            print(out_message)
+            raise Exception(out_message)
 
     file_name = os.path.basename(file_path)
     file_paths.append(file_path)
@@ -528,7 +539,7 @@ def run_tabular_duplicate_detection(files, threshold, min_words, text_columns, o
     # If output folder doesn't end with a forward slash, add one
     if not output_folder.endswith('/'): output_folder = output_folder + '/'
 
-    file_paths =
+    file_paths = list()
     if isinstance(files, str):
         # If 'files' is a single string, treat it as a list with one element
         file_paths.append(files)
@@ -551,6 +562,11 @@ def run_tabular_duplicate_detection(files, threshold, min_words, text_columns, o
         # Raise an error for any other unexpected type of the 'files' argument itself
         raise TypeError(f"Unexpected type for 'files' argument: {type(files)}. Expected str, list of str/file objects, or a single file object.")
 
+    if len(file_paths) > MAX_SIMULTANEOUS_FILES:
+        out_message = f"Number of files to deduplicate is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
+        print(out_message)
+        raise Exception(out_message)
+
     results_df, output_paths, full_data = run_tabular_duplicate_analysis(
         files=file_paths,
        threshold=threshold,
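Finally, a small self-contained sketch of the row-limit check added to find_duplicate_cells_in_tabular_data, assuming MAX_TABULAR_ROWS is an integer from tools.config (the value below is a placeholder only).

import pandas as pd

MAX_TABULAR_ROWS = 250_000  # placeholder; the real limit lives in tools.config

def check_row_limit(temp_df: pd.DataFrame, file_path: str) -> None:
    """Mirror of the guard added above: reject tables with more rows than allowed."""
    if temp_df.shape[0] > MAX_TABULAR_ROWS:
        out_message = (f"Number of rows in {file_path} is greater than {MAX_TABULAR_ROWS}. "
                       "Please submit a smaller file.")
        print(out_message)
        raise Exception(out_message)

# A three-row frame passes; an oversized one would raise before any deduplication runs.
check_row_limit(pd.DataFrame({"text": ["a", "b", "c"]}), "example.csv")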