seanpedrickcase committed on
Commit
b3d51df
·
1 Parent(s): 3dd6d75

Formatter and linter check

Browse files
tools/custom_image_analyser_engine.py CHANGED
@@ -499,13 +499,15 @@ class CustomImageAnalyzerEngine:
499
  self.language = language or DEFAULT_LANGUAGE or "en"
500
  self.tesseract_lang = _tesseract_lang_code(self.language)
501
  self.paddle_lang = _paddle_lang_code(self.language)
502
-
503
  # Security: Validate and normalize output_folder at construction time
504
  # This ensures the object is always in a secure state and prevents
505
  # any future code from accidentally using an untrusted directory
506
  normalized_output_folder = os.path.normpath(os.path.abspath(output_folder))
507
  if not validate_folder_containment(normalized_output_folder, OUTPUT_FOLDER):
508
- raise ValueError(f"Unsafe output folder path: {output_folder}. Must be contained within {OUTPUT_FOLDER}")
 
 
509
  self.output_folder = normalized_output_folder
510
 
511
  if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid":
 
499
  self.language = language or DEFAULT_LANGUAGE or "en"
500
  self.tesseract_lang = _tesseract_lang_code(self.language)
501
  self.paddle_lang = _paddle_lang_code(self.language)
502
+
503
  # Security: Validate and normalize output_folder at construction time
504
  # This ensures the object is always in a secure state and prevents
505
  # any future code from accidentally using an untrusted directory
506
  normalized_output_folder = os.path.normpath(os.path.abspath(output_folder))
507
  if not validate_folder_containment(normalized_output_folder, OUTPUT_FOLDER):
508
+ raise ValueError(
509
+ f"Unsafe output folder path: {output_folder}. Must be contained within {OUTPUT_FOLDER}"
510
+ )
511
  self.output_folder = normalized_output_folder
512
 
513
  if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid":
tools/file_conversion.py CHANGED
@@ -2813,12 +2813,9 @@ def fill_missing_ids(
2813
  # warnings.warn(f"Column '{column_name}' dtype might change from '{original_dtype}' to 'object' due to string ID assignment.", UserWarning)
2814
 
2815
  df.loc[rows_to_fill_index, column_name] = new_ids_list
2816
- print(
2817
- f"Successfully assigned {len(new_ids_list)} new unique IDs to column '{column_name}'."
2818
- )
2819
-
2820
- # Optional: Convert the entire column to string type at the end for consistency
2821
- # df[column_name] = df[column_name].astype(str)
2822
 
2823
  return df
2824
 
 
2813
  # warnings.warn(f"Column '{column_name}' dtype might change from '{original_dtype}' to 'object' due to string ID assignment.", UserWarning)
2814
 
2815
  df.loc[rows_to_fill_index, column_name] = new_ids_list
2816
+ # print(
2817
+ # f"Successfully assigned {len(new_ids_list)} new unique IDs to column '{column_name}'."
2818
+ # )
 
 
 
2819
 
2820
  return df
2821
 
tools/find_duplicate_pages.py CHANGED
@@ -474,7 +474,7 @@ def combine_ocr_dataframes(
474
  raise ValueError(
475
  f"Unsafe normalized output folder path: {normalized_output_folder}"
476
  )
477
-
478
  # Assign the validated path back to output_folder to ensure all subsequent
479
  # operations use the secure, validated value
480
  output_folder = normalized_output_folder
 
474
  raise ValueError(
475
  f"Unsafe normalized output folder path: {normalized_output_folder}"
476
  )
477
+
478
  # Assign the validated path back to output_folder to ensure all subsequent
479
  # operations use the secure, validated value
480
  output_folder = normalized_output_folder