Commit
·
b3d51df
1
Parent(s):
3dd6d75
Formatter and linter check
Browse files
tools/custom_image_analyser_engine.py
CHANGED
|
@@ -499,13 +499,15 @@ class CustomImageAnalyzerEngine:
|
|
| 499 |
self.language = language or DEFAULT_LANGUAGE or "en"
|
| 500 |
self.tesseract_lang = _tesseract_lang_code(self.language)
|
| 501 |
self.paddle_lang = _paddle_lang_code(self.language)
|
| 502 |
-
|
| 503 |
# Security: Validate and normalize output_folder at construction time
|
| 504 |
# This ensures the object is always in a secure state and prevents
|
| 505 |
# any future code from accidentally using an untrusted directory
|
| 506 |
normalized_output_folder = os.path.normpath(os.path.abspath(output_folder))
|
| 507 |
if not validate_folder_containment(normalized_output_folder, OUTPUT_FOLDER):
|
| 508 |
-
raise ValueError(f"Unsafe output folder path: {output_folder}. Must be contained within {OUTPUT_FOLDER}")
|
|
|
|
|
|
|
| 509 |
self.output_folder = normalized_output_folder
|
| 510 |
|
| 511 |
if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid":
|
|
|
|
| 499 |
self.language = language or DEFAULT_LANGUAGE or "en"
|
| 500 |
self.tesseract_lang = _tesseract_lang_code(self.language)
|
| 501 |
self.paddle_lang = _paddle_lang_code(self.language)
|
| 502 |
+
|
| 503 |
# Security: Validate and normalize output_folder at construction time
|
| 504 |
# This ensures the object is always in a secure state and prevents
|
| 505 |
# any future code from accidentally using an untrusted directory
|
| 506 |
normalized_output_folder = os.path.normpath(os.path.abspath(output_folder))
|
| 507 |
if not validate_folder_containment(normalized_output_folder, OUTPUT_FOLDER):
|
| 508 |
+
raise ValueError(
|
| 509 |
+
f"Unsafe output folder path: {output_folder}. Must be contained within {OUTPUT_FOLDER}"
|
| 510 |
+
)
|
| 511 |
self.output_folder = normalized_output_folder
|
| 512 |
|
| 513 |
if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid":
|
tools/file_conversion.py
CHANGED
|
@@ -2813,12 +2813,9 @@ def fill_missing_ids(
|
|
| 2813 |
# warnings.warn(f"Column '{column_name}' dtype might change from '{original_dtype}' to 'object' due to string ID assignment.", UserWarning)
|
| 2814 |
|
| 2815 |
df.loc[rows_to_fill_index, column_name] = new_ids_list
|
| 2816 |
-
print(
|
| 2817 |
-
f"Successfully assigned {len(new_ids_list)} new unique IDs to column '{column_name}'."
|
| 2818 |
-
)
|
| 2819 |
-
|
| 2820 |
-
# Optional: Convert the entire column to string type at the end for consistency
|
| 2821 |
-
# df[column_name] = df[column_name].astype(str)
|
| 2822 |
|
| 2823 |
return df
|
| 2824 |
|
|
|
|
| 2813 |
# warnings.warn(f"Column '{column_name}' dtype might change from '{original_dtype}' to 'object' due to string ID assignment.", UserWarning)
|
| 2814 |
|
| 2815 |
df.loc[rows_to_fill_index, column_name] = new_ids_list
|
| 2816 |
+
# print(
|
| 2817 |
+
# f"Successfully assigned {len(new_ids_list)} new unique IDs to column '{column_name}'."
|
| 2818 |
+
# )
|
|
|
|
|
|
|
|
|
|
| 2819 |
|
| 2820 |
return df
|
| 2821 |
|
tools/find_duplicate_pages.py
CHANGED
|
@@ -474,7 +474,7 @@ def combine_ocr_dataframes(
|
|
| 474 |
raise ValueError(
|
| 475 |
f"Unsafe normalized output folder path: {normalized_output_folder}"
|
| 476 |
)
|
| 477 |
-
|
| 478 |
# Assign the validated path back to output_folder to ensure all subsequent
|
| 479 |
# operations use the secure, validated value
|
| 480 |
output_folder = normalized_output_folder
|
|
|
|
| 474 |
raise ValueError(
|
| 475 |
f"Unsafe normalized output folder path: {normalized_output_folder}"
|
| 476 |
)
|
| 477 |
+
|
| 478 |
# Assign the validated path back to output_folder to ensure all subsequent
|
| 479 |
# operations use the secure, validated value
|
| 480 |
output_folder = normalized_output_folder
|