Commit · ad8fef5
Parent(s): fcd55d0

Fixed duplicate page argument mismatch. Re-added Windows tests. Added refresh token options to cdk. Package updates.
Files changed:
- .github/workflows/multi-os-test.yml  +34 -5
- cdk/cdk_config.py  +10 -0
- cdk/cdk_stack.py  +10 -0
- pyproject.toml  +12 -12
- requirements.txt  +11 -12
- tools/find_duplicate_pages.py  +24 -26
- tools/redaction_review.py  +0 -2
.github/workflows/multi-os-test.yml
CHANGED

@@ -15,12 +15,12 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest,
+        os: [ubuntu-latest, windows-latest, macos-latest]
         python-version: ["3.11", "3.12", "3.13"]
         exclude:
           # Exclude some combinations to reduce CI time
-
-
+          - os: windows-latest
+            python-version: "3.11"
           - os: macos-latest
             python-version: "3.11"

@@ -51,8 +51,37 @@ jobs:
       - name: Install system dependencies (Windows)
         if: matrix.os == 'windows-latest'
         run: |
-          #
-
+          # Create tools directory
+          mkdir C:\tools
+
+          # Download and install Tesseract
+          $tesseractUrl = "https://github.com/UB-Mannheim/tesseract/releases/download/v5.3.3.20231005/tesseract-ocr-w64-setup-5.3.3.20231005.exe"
+          $tesseractInstaller = "C:\tools\tesseract-installer.exe"
+          Invoke-WebRequest -Uri $tesseractUrl -OutFile $tesseractInstaller
+
+          # Install Tesseract silently
+          Start-Process -FilePath $tesseractInstaller -ArgumentList "/S", "/D=C:\tools\tesseract" -Wait
+
+          # Download and extract Poppler
+          $popplerUrl = "https://github.com/oschwartz10612/poppler-windows/releases/download/v24.02.0-0/Release-24.02.0-0.zip"
+          $popplerZip = "C:\tools\poppler.zip"
+          Invoke-WebRequest -Uri $popplerUrl -OutFile $popplerZip
+
+          # Extract Poppler
+          Expand-Archive -Path $popplerZip -DestinationPath C:\tools\poppler -Force
+
+          # Add to PATH
+          echo "C:\tools\tesseract" >> $env:GITHUB_PATH
+          echo "C:\tools\poppler\poppler-24.02.0\Library\bin" >> $env:GITHUB_PATH
+
+          # Set environment variables for your application
+          echo "TESSERACT_FOLDER=C:\tools\tesseract" >> $env:GITHUB_ENV
+          echo "POPPLER_FOLDER=C:\tools\poppler\poppler-24.02.0\Library\bin" >> $env:GITHUB_ENV
+          echo "TESSERACT_DATA_FOLDER=C:\tools\tesseract\tessdata" >> $env:GITHUB_ENV
+
+          # Verify installation
+          tesseract --version
+          pdftoppm -v

       - name: Install Python dependencies
         run: |
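The TESSERACT_FOLDER, POPPLER_FOLDER and TESSERACT_DATA_FOLDER variables exported above only matter if the application reads them when configuring its OCR and PDF tooling. A minimal sketch of that wiring using pytesseract and pdf2image is shown below; the variable names come from the workflow, but the consuming code is illustrative and not part of this commit, and "example.pdf" is a placeholder input.

# Illustrative sketch only: consume the env vars exported by the Windows CI step.
import os

import pytesseract
from pdf2image import convert_from_path

tesseract_folder = os.environ.get("TESSERACT_FOLDER", "")
poppler_folder = os.environ.get("POPPLER_FOLDER", "")

if tesseract_folder:
    # Point pytesseract at the CI-installed tesseract.exe
    pytesseract.pytesseract.tesseract_cmd = os.path.join(tesseract_folder, "tesseract.exe")

# pdf2image needs an explicit Poppler path on Windows when Poppler is not on PATH.
# "example.pdf" is a placeholder input file.
pages = convert_from_path("example.pdf", poppler_path=poppler_folder or None)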
cdk/cdk_config.py
CHANGED

@@ -219,6 +219,16 @@ COGNITO_USER_POOL_DOMAIN_PREFIX = get_or_create_env_var(
     "COGNITO_USER_POOL_DOMAIN_PREFIX", "redaction-app-domain"
 )  # Should change this to something unique or you'll probably hit an error
 
+COGNITO_REFRESH_TOKEN_VALIDITY = int(
+    get_or_create_env_var("COGNITO_REFRESH_TOKEN_VALIDITY", "480")
+)  # Minutes
+COGNITO_ID_TOKEN_VALIDITY = int(
+    get_or_create_env_var("COGNITO_ID_TOKEN_VALIDITY", "60")
+)  # Minutes
+COGNITO_ACCESS_TOKEN_VALIDITY = int(
+    get_or_create_env_var("COGNITO_ACCESS_TOKEN_VALIDITY", "60")
+)  # Minutes
+
 # Application load balancer
 ALB_NAME = get_or_create_env_var(
     "ALB_NAME", f"{CDK_PREFIX}Alb"[-32:]
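The new token-validity settings follow the same pattern as the rest of cdk_config.py: read an environment variable, fall back to a default, and cast to int. A plausible shape for the get_or_create_env_var helper is sketched below for readers without the repo open; the real implementation in cdk_config.py may differ (for example, it may also persist the value to a .env file).

# Illustrative only: a plausible shape for get_or_create_env_var; the repo's helper may differ.
import os


def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Return the existing environment value, or set and return the default
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value


# With no COGNITO_REFRESH_TOKEN_VALIDITY set, this yields the 480-minute default
refresh_minutes = int(get_or_create_env_var("COGNITO_REFRESH_TOKEN_VALIDITY", "480"))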
cdk/cdk_stack.py
CHANGED

@@ -39,7 +39,10 @@ from cdk_config import (
     CLUSTER_NAME,
     CODEBUILD_PROJECT_NAME,
     CODEBUILD_ROLE_NAME,
+    COGNITO_ACCESS_TOKEN_VALIDITY,
+    COGNITO_ID_TOKEN_VALIDITY,
     COGNITO_REDIRECTION_URL,
+    COGNITO_REFRESH_TOKEN_VALIDITY,
     COGNITO_USER_POOL_CLIENT_NAME,
     COGNITO_USER_POOL_CLIENT_SECRET_NAME,
     COGNITO_USER_POOL_DOMAIN_PREFIX,

@@ -1161,6 +1164,13 @@ class CdkStack(Stack):
                 ],
                 callback_urls=redirect_uris,
             ),
+            refresh_token_validity=Duration.minutes(
+                COGNITO_REFRESH_TOKEN_VALIDITY
+            ),
+            id_token_validity=Duration.minutes(COGNITO_ID_TOKEN_VALIDITY),
+            access_token_validity=Duration.minutes(
+                COGNITO_ACCESS_TOKEN_VALIDITY
+            ),
         )
 
         CfnOutput(
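In aws-cdk-lib, refresh_token_validity, id_token_validity and access_token_validity are standard options on a Cognito user pool client. A minimal sketch of the pattern appears below; user_pool and redirect_uris stand in for objects built elsewhere in cdk_stack.py (not shown in this diff), and the construct id is hypothetical.

# Minimal sketch (not the repo's code) of how the imported validity settings
# feed a Cognito user pool client in aws-cdk-lib v2.
from aws_cdk import Duration
from aws_cdk import aws_cognito as cognito


def add_app_client(user_pool: cognito.UserPool, redirect_uris: list[str]):
    # "RedactionAppClient" is a hypothetical construct id
    return user_pool.add_client(
        "RedactionAppClient",
        o_auth=cognito.OAuthSettings(callback_urls=redirect_uris),
        refresh_token_validity=Duration.minutes(480),
        id_token_validity=Duration.minutes(60),
        access_token_validity=Duration.minutes(60),
    )

Cognito itself bounds these values (refresh token validity of at least 60 minutes, access and ID token validity between 5 minutes and 1 day), so the 480/60/60-minute defaults from cdk_config.py sit inside the allowed ranges.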
pyproject.toml
CHANGED

@@ -12,33 +12,33 @@ requires-python = ">=3.10"
 dependencies = [
     "pdfminer.six==20250506",
     "pdf2image==1.17.0",
-    "pymupdf==1.26.
+    "pymupdf==1.26.4",
     "opencv-python==4.12.0.88",
-    "presidio_analyzer==2.2.
-    "presidio_anonymizer==2.2.
+    "presidio_analyzer==2.2.360",
+    "presidio_anonymizer==2.2.360",
     "presidio-image-redactor==0.0.57",
-    "pikepdf==9.
-    "pandas==2.3.
-    "scikit-learn==1.7.
+    "pikepdf==9.11.0",
+    "pandas==2.3.2",
+    "scikit-learn==1.7.2",
     "spacy==3.8.7",
     # Direct URL dependency for spacy model
     "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
-    "gradio==5.
-    "boto3==1.40.
+    "gradio==5.47.0",
+    "boto3==1.40.37",
     "pyarrow==21.0.0",
     "openpyxl==3.1.5",
-    "Faker==37.
+    "Faker==37.8.0",
     "python-levenshtein==0.27.1",
     "spaczz==0.6.1",
     # Direct URL dependency for gradio_image_annotator wheel
     "gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl",
-    "rapidfuzz==3.
+    "rapidfuzz==3.14.1",
     "python-dotenv==1.0.1",
     "awslambdaric==3.1.1",
     "python-docx==1.2.0",
-    "paddlepaddle==3.1.0",
-    "paddleocr==3.1.1",
     "polars==1.33.1"
+    #"paddlepaddle==3.2.0",  # Optional paddle imports - only if you want to use hybrid OCR mode with tesseract and paddleOCR
+    #"paddleocr==3.2.0"
 ]
 
 [project.urls]
requirements.txt
CHANGED

@@ -1,32 +1,31 @@
 pdfminer.six==20250506
 pdf2image==1.17.0
-pymupdf==1.26.
+pymupdf==1.26.4
 opencv-python==4.12.0.88
-presidio_analyzer==2.2.
-presidio_anonymizer==2.2.
+presidio_analyzer==2.2.360
+presidio_anonymizer==2.2.360
 presidio-image-redactor==0.0.57
-pikepdf==9.
+pikepdf==9.11.0
 pandas==2.3.2
-scikit-learn==1.7.
+scikit-learn==1.7.2
 spacy==3.8.7
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
-gradio==5.
+gradio==5.47.0
 polars==1.33.1
-boto3==1.40.
+boto3==1.40.37
 pyarrow==21.0.0
 openpyxl==3.1.5
-Faker==37.
+Faker==37.8.0
 python-levenshtein==0.27.1
 spaczz==0.6.1
-# The following version
 https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
-rapidfuzz==3.
+rapidfuzz==3.14.1
 python-dotenv==1.0.1
 awslambdaric==3.1.1
 python-docx==1.2.0
 # Optional: uncomment the below to install paddleOCR if you want to use hybrid text extraction (tesseract plus paddleocr)
-# paddlepaddle==3.
-# paddleocr==3.
+# paddlepaddle==3.2.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
+# paddleocr==3.2.0
 
 # Test dependencies
 pytest>=7.0.0
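Both dependency files now treat PaddlePaddle/PaddleOCR as optional, so the application presumably has to degrade gracefully when they are absent. A guarded import along the following lines is one common way to do that; the function and flag names here are illustrative and not taken from this repo.

# Illustrative sketch: fall back to Tesseract-only OCR when paddleocr is not installed.
try:
    from paddleocr import PaddleOCR  # optional hybrid-OCR dependency
    HYBRID_OCR_AVAILABLE = True
except ImportError:
    PaddleOCR = None
    HYBRID_OCR_AVAILABLE = False


def make_paddle_engine():
    # Return a PaddleOCR engine if the package is installed, otherwise None (Tesseract-only mode)
    if HYBRID_OCR_AVAILABLE:
        return PaddleOCR(lang="en")
    return None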
tools/find_duplicate_pages.py
CHANGED

@@ -36,7 +36,7 @@ def split_text_with_punctuation(text: str) -> List[str]:
     # 2. OR a sequence of one or more characters that are NOT punctuation or whitespace `[^.,?!:;\s]+`
     pattern = re.compile(r"([.,?!:;]+|[^.,?!:;\s]+)")
 
-    final_list =
+    final_list = list()
     # We first split by whitespace to handle sentences correctly
     for word in text.split():
         # Then, for each whitespace-separated word, we tokenize it further

@@ -381,7 +381,7 @@ def combine_ocr_dataframes(
         - The final combined and processed DataFrame.
         - A list containing the path to the saved output CSV file.
     """
-    all_data =
+    all_data = list()
 
     for file_identifier, df_initial in input_data:
         df = df_initial.copy()  # Work on a copy to avoid side effects

@@ -454,7 +454,7 @@ def combine_ocr_dataframes(
     combined_df = combined_df.copy()[existing_final_columns]
 
     # --- Save Output ---
-    output_files =
+    output_files = list()
     if output_folder and output_filename:
         os.makedirs(output_folder, exist_ok=True)
         output_path = os.path.join(output_folder, output_filename)

@@ -490,7 +490,7 @@ def combine_ocr_output_text(
     else:
         file_paths_list = input_files
 
-    data_to_process =
+    data_to_process = list()
     for file_path in file_paths_list:
         try:
             df = pd.read_csv(file_path)

@@ -644,7 +644,7 @@ def save_results_and_redaction_lists(
     Returns:
         list: A list of paths to all generated files.
     """
-    output_paths =
+    output_paths = list()
     output_folder_path = Path(output_folder)
     output_folder_path.mkdir(exist_ok=True)

@@ -657,7 +657,6 @@ def save_results_and_redaction_lists(
     final_df.to_csv(similarity_file_output_path, index=False, encoding="utf-8-sig")
 
     output_paths.append(str(similarity_file_output_path))
-    # print(f"Main results saved to {similarity_file_output_path}")
 
     # 2. Save per-file redaction lists
     # Use 'Page2_File' as the source of duplicate content

@@ -754,7 +753,6 @@ def find_consecutive_sequence_matches(
         A DataFrame with two columns ('Page1_Index', 'Page2_Index') mapping the
         consecutive match, or an empty DataFrame if no match is found.
     """
-    # print(f"Starting sequence search for '{search_file_name}' in '{reference_file_name}'...")
 
     # Step 1: Isolate the data for each file
     search_df = df_filtered[df_filtered["file"] == search_file_name]

@@ -773,7 +771,7 @@ def find_consecutive_sequence_matches(
     reference_indices = reference_df.index.tolist()
 
     query_len = len(query_tokens)
-    all_found_matches =
+    all_found_matches = list()
 
     print(f"Searching for a sequence of {query_len} tokens...")

@@ -784,7 +782,6 @@ def find_consecutive_sequence_matches(
 
         # Step 4: If the window matches the query with or without punctuation on end
         if _sequences_match(query_tokens, window):
-            # print(f"Found a consecutive match starting at reference index: {reference_indices[i]}")
 
             # Get the global indices for this entire matching block
             matching_reference_indices = reference_indices[i : i + query_len]

@@ -874,7 +871,7 @@ def identify_similar_text_sequences(
         # Use the original, simpler path for all-to-all comparisons (including intra-file).
         vectorizer = TfidfVectorizer()
         print("Standard Path: Calculating all-to-all similarity.")
-        progress(0.2, desc="
+        progress(0.2, desc="Vectorising text...")
         tfidf_matrix = vectorizer.fit_transform(df_filtered["text_clean"])
 
         progress(0.3, desc="Calculating similarity matrix...")

@@ -897,12 +894,8 @@ def identify_similar_text_sequences(
     progress(0.7, desc="Aggregating results based on matching strategy")
 
     if greedy_match or min_consecutive_pages > 1:
-        # print("Finding all consecutive page matches of minimum length:", min_consecutive_pages)
-
         # Sort the dataframe to ensure consecutive pages are adjacent
-        similarity_df = (
-            base_similarity_df  # .sort_values(['Page1_Index', 'Page2_Index']).copy()
-        )
+        similarity_df = base_similarity_df
 
         # A new sequence starts if the difference from the previous row is not (1, 1)
         # is_consecutive will be True if a row continues the sequence, False if it's a new one.

@@ -1023,8 +1016,8 @@ def run_duplicate_analysis(
     min_consecutive: int,
     greedy_match: bool,
     combine_pages: bool = True,
-    preview_length: int = 500,
     output_folder: str = OUTPUT_FOLDER,
+    preview_length: int = 500,
     progress=gr.Progress(track_tqdm=True),
 ):
     """

@@ -1039,8 +1032,8 @@ def run_duplicate_analysis(
         min_consecutive (int): The minimum number of consecutive pages that must match for a sequence to be considered a duplicate.
         greedy_match (bool): If True, uses a greedy matching strategy for identifying consecutive sequences.
         combine_pages (bool, optional): If True, text from multiple pages is combined into larger segments for analysis. Defaults to True.
-        preview_length (int, optional): The maximum number of characters to display in the text preview panes. Defaults to 500.
         output_folder (str, optional): The directory where the similarity results and redaction lists will be saved. Defaults to OUTPUT_FOLDER.
+        preview_length (int, optional): The maximum number of characters to display in the text preview panes. Defaults to 500.
         progress (gr.Progress, optional): A Gradio progress tracker object to display progress in the UI.
     """

@@ -1079,8 +1072,11 @@ def run_duplicate_analysis(
         progress=progress,
     )
 
+    full_df["text"] = full_df["text"].astype(str)
+
     # Clip text to first 200 characters
     full_df["text"] = full_df["text"].str[:preview_length]
+
     # Preprocess full_data (without preview text) for fast access (run once)
     full_data_by_file = {
         file: df.sort_values("page").set_index("page")

@@ -1135,6 +1131,9 @@ def show_page_previews(
     page1_data = full_data_by_file[file1].loc[[page1], ["text"]].reset_index()
     page2_data = full_data_by_file[file2].loc[[page2], ["text"]].reset_index()
 
+    page1_data["text"] = page1_data["text"].astype(str)
+    page2_data["text"] = page2_data["text"].astype(str)
+
     page1_data["text"] = page1_data["text"].str[:preview_length]
     page2_data["text"] = page2_data["text"].str[:preview_length]

@@ -1217,10 +1216,10 @@ def apply_whole_page_redactions_from_list(
         new_annotations_with_bounding_boxes (List[dict], optional): A list of new annotations with bounding boxes. Defaults to an empty list.
     """
     if all_existing_annotations is None:
-        all_existing_annotations =
+        all_existing_annotations = list()
 
     if new_annotations_with_bounding_boxes is None:
-        new_annotations_with_bounding_boxes =
+        new_annotations_with_bounding_boxes = list()
 
     all_annotations = all_existing_annotations.copy()

@@ -1229,7 +1228,7 @@ def apply_whole_page_redactions_from_list(
         print(f"Warning: {message}")
         raise Warning(message)
 
-    list_whole_pages_to_redact =
+    list_whole_pages_to_redact = list()
 
     if combine_pages is True:
         # Get list of pages to redact from either dataframe or file

@@ -1270,7 +1269,7 @@ def apply_whole_page_redactions_from_list(
             print(message)
             raise Warning(message)
 
-        list_whole_pages_to_redact =
+        list_whole_pages_to_redact = list()
         for annotation in new_annotations_with_bounding_boxes:
             from tools.secure_regex_utils import safe_extract_page_number_from_path

@@ -1285,7 +1284,7 @@ def apply_whole_page_redactions_from_list(
 
     list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))
 
-    new_annotations =
+    new_annotations = list()
     # Process each page for redaction
     for page in list_whole_pages_to_redact:
         try:

@@ -1312,8 +1311,7 @@ def apply_whole_page_redactions_from_list(
             )
             continue
 
-
-        boxes_to_add = []
+        boxes_to_add = list()
 
         pymupdf_page = pymupdf_doc[page_index]

@@ -1439,7 +1437,7 @@ def create_annotation_objects_from_duplicates(
     Returns:
         List[Dict]: A list of dictionaries, where each dict represents a page and its list of annotation boxes, in the format: [{"image": "path/to/img.png", "boxes": [...]}, ...]
     """
-    final_output =
+    final_output = list()
 
     if duplicates_df.empty:
         raise Warning("No duplicates found")

@@ -1503,7 +1501,7 @@ def create_annotation_objects_from_duplicates(
             annotations_by_page[page_number].append(box)
 
     # --- Format the final output list using the page-to-image map ---
-    final_output =
+    final_output = list()
     # Sort by page number for a predictable order
     for page_num, boxes in sorted(annotations_by_page.items()):
         # Look up the image path using the page number
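The first hunk above exposes the tokenisation regex used by split_text_with_punctuation: punctuation runs and non-punctuation runs come out as separate tokens for each whitespace-separated word. The standalone sketch below reuses only the regex from the diff; the sample sentence and variable names are made up.

# Quick demonstration of the tokenisation pattern from split_text_with_punctuation.
import re

pattern = re.compile(r"([.,?!:;]+|[^.,?!:;\s]+)")

tokens = []
for word in "Page 1. Page 2, again!".split():
    # Each word is split into word characters and punctuation runs
    tokens.extend(pattern.findall(word))

print(tokens)  # ['Page', '1', '.', 'Page', '2', ',', 'again', '!']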
tools/redaction_review.py
CHANGED

@@ -1017,8 +1017,6 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
 
     progress(1.0, desc="Completed annotation processing")
 
-    print("final_annotations_list:", final_annotations_list)
-
     return (
         final_annotations_list,
         existing_annotations_list,