Spaces:

seanpedrickcase
/

document_redaction

Running

seanpedrickcase committed on Oct 23

Commit

b3d51df

1 Parent(s): 3dd6d75

Formatter and linter check

Files changed (3) hide show

tools/custom_image_analyser_engine.py CHANGED Viewed

@@ -499,13 +499,15 @@ class CustomImageAnalyzerEngine:
         self.language = language or DEFAULT_LANGUAGE or "en"
         self.tesseract_lang = _tesseract_lang_code(self.language)
         self.paddle_lang = _paddle_lang_code(self.language)
         # Security: Validate and normalize output_folder at construction time
         # This ensures the object is always in a secure state and prevents
         # any future code from accidentally using an untrusted directory
         normalized_output_folder = os.path.normpath(os.path.abspath(output_folder))
         if not validate_folder_containment(normalized_output_folder, OUTPUT_FOLDER):
-            raise ValueError(f"Unsafe output folder path: {output_folder}. Must be contained within {OUTPUT_FOLDER}")
         self.output_folder = normalized_output_folder
         if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid":

         self.language = language or DEFAULT_LANGUAGE or "en"
         self.tesseract_lang = _tesseract_lang_code(self.language)
         self.paddle_lang = _paddle_lang_code(self.language)
         # Security: Validate and normalize output_folder at construction time
         # This ensures the object is always in a secure state and prevents
         # any future code from accidentally using an untrusted directory
         normalized_output_folder = os.path.normpath(os.path.abspath(output_folder))
         if not validate_folder_containment(normalized_output_folder, OUTPUT_FOLDER):
+            raise ValueError(
+                f"Unsafe output folder path: {output_folder}. Must be contained within {OUTPUT_FOLDER}"
+            )
         self.output_folder = normalized_output_folder
         if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid":

tools/file_conversion.py CHANGED Viewed

@@ -2813,12 +2813,9 @@ def fill_missing_ids(
         # warnings.warn(f"Column '{column_name}' dtype might change from '{original_dtype}' to 'object' due to string ID assignment.", UserWarning)
     df.loc[rows_to_fill_index, column_name] = new_ids_list
-    print(
-        f"Successfully assigned {len(new_ids_list)} new unique IDs to column '{column_name}'."
-    )
-    # Optional: Convert the entire column to string type at the end for consistency
-    # df[column_name] = df[column_name].astype(str)
     return df

         # warnings.warn(f"Column '{column_name}' dtype might change from '{original_dtype}' to 'object' due to string ID assignment.", UserWarning)
     df.loc[rows_to_fill_index, column_name] = new_ids_list
+    # print(
+    #     f"Successfully assigned {len(new_ids_list)} new unique IDs to column '{column_name}'."
+    # )
     return df

tools/find_duplicate_pages.py CHANGED Viewed

@@ -474,7 +474,7 @@ def combine_ocr_dataframes(
             raise ValueError(
                 f"Unsafe normalized output folder path: {normalized_output_folder}"
             )
         # Assign the validated path back to output_folder to ensure all subsequent
         # operations use the secure, validated value
         output_folder = normalized_output_folder

             raise ValueError(
                 f"Unsafe normalized output folder path: {normalized_output_folder}"
             )
         # Assign the validated path back to output_folder to ensure all subsequent
         # operations use the secure, validated value
         output_folder = normalized_output_folder