Merge pull request #8 from seanpedrick-case/dev
Browse files. Fixed csv/xlsx redaction.
Updated guide on creating exe.
Corrected image coordinate translation when the pdf mediabox is not the same size as pdf page rectangle
Fixed issues with gradio version 5.16.
Fixed fuzzy search error with pages with no data.
Added git to Dockerfile to be able to install git-based custom gradio components
- DocRedactApp_0.2.spec → DocRedactApp_0.2.0.spec +18 -4
- Dockerfile +2 -1
- app.py +1 -1
- how_to_create_exe_dist.txt +24 -6
- requirements.txt +5 -2
- tools/data_anonymise.py +6 -0
- tools/file_conversion.py +124 -27
- tools/file_redaction.py +11 -4
- tools/load_spacy_model_custom_recognisers.py +8 -8
- tools/redaction_review.py +10 -5
DocRedactApp_0.2.spec → DocRedactApp_0.2.0.spec
RENAMED
@@ -1,17 +1,31 @@
|
|
1 |
# -*- mode: python ; coding: utf-8 -*-
|
2 |
from PyInstaller.utils.hooks import collect_data_files
|
|
|
3 |
|
4 |
datas = [('tesseract/', 'tesseract/'), ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/')]
|
|
|
|
|
5 |
datas += collect_data_files('gradio_client')
|
6 |
datas += collect_data_files('gradio')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
|
9 |
a = Analysis(
|
10 |
['app.py'],
|
11 |
pathex=[],
|
12 |
-
binaries=
|
13 |
datas=datas,
|
14 |
-
hiddenimports=
|
15 |
hookspath=['build_deps'],
|
16 |
hooksconfig={},
|
17 |
runtime_hooks=[],
|
@@ -29,7 +43,7 @@ exe = EXE(
|
|
29 |
a.scripts,
|
30 |
[],
|
31 |
exclude_binaries=True,
|
32 |
-
name='DocRedactApp_0.2',
|
33 |
debug=False,
|
34 |
bootloader_ignore_signals=False,
|
35 |
strip=False,
|
@@ -48,5 +62,5 @@ coll = COLLECT(
|
|
48 |
strip=False,
|
49 |
upx=True,
|
50 |
upx_exclude=[],
|
51 |
-
name='DocRedactApp_0.2',
|
52 |
)
|
|
|
1 |
# -*- mode: python ; coding: utf-8 -*-
|
2 |
from PyInstaller.utils.hooks import collect_data_files
|
3 |
+
from PyInstaller.utils.hooks import collect_all
|
4 |
|
5 |
datas = [('tesseract/', 'tesseract/'), ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/')]
|
6 |
+
binaries = []
|
7 |
+
hiddenimports = ['gradio_image_annotation', 'pyarrow.vendored.version', 'pydicom.encoders', 'safehttpx', 'presidio_analyzer', 'presidio_anonymizer', 'presidio_image_redactor']
|
8 |
datas += collect_data_files('gradio_client')
|
9 |
datas += collect_data_files('gradio')
|
10 |
+
datas += collect_data_files('gradio_image_annotation')
|
11 |
+
tmp_ret = collect_all('gradio_image_annotation')
|
12 |
+
datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
|
13 |
+
tmp_ret = collect_all('safehttpx')
|
14 |
+
datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
|
15 |
+
tmp_ret = collect_all('presidio_analyzer')
|
16 |
+
datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
|
17 |
+
tmp_ret = collect_all('presidio_anonymizer')
|
18 |
+
datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
|
19 |
+
tmp_ret = collect_all('presidio_image_redactor')
|
20 |
+
datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
|
21 |
|
22 |
|
23 |
a = Analysis(
|
24 |
['app.py'],
|
25 |
pathex=[],
|
26 |
+
binaries=binaries,
|
27 |
datas=datas,
|
28 |
+
hiddenimports=hiddenimports,
|
29 |
hookspath=['build_deps'],
|
30 |
hooksconfig={},
|
31 |
runtime_hooks=[],
|
|
|
43 |
a.scripts,
|
44 |
[],
|
45 |
exclude_binaries=True,
|
46 |
+
name='DocRedactApp_0.2.0',
|
47 |
debug=False,
|
48 |
bootloader_ignore_signals=False,
|
49 |
strip=False,
|
|
|
62 |
strip=False,
|
63 |
upx=True,
|
64 |
upx_exclude=[],
|
65 |
+
name='DocRedactApp_0.2.0',
|
66 |
)
|
Dockerfile
CHANGED
@@ -8,7 +8,8 @@ RUN apt-get update \
|
|
8 |
make \
|
9 |
cmake \
|
10 |
unzip \
|
11 |
-
libcurl4-openssl-dev \
|
|
|
12 |
&& apt-get clean \
|
13 |
&& rm -rf /var/lib/apt/lists/*
|
14 |
|
|
|
8 |
make \
|
9 |
cmake \
|
10 |
unzip \
|
11 |
+
libcurl4-openssl-dev \
|
12 |
+
git \
|
13 |
&& apt-get clean \
|
14 |
&& rm -rf /var/lib/apt/lists/*
|
15 |
|
app.py
CHANGED
@@ -453,7 +453,7 @@ with app:
|
|
453 |
# TABULAR DATA REDACTION
|
454 |
###
|
455 |
in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
|
456 |
-
then(fn=get_input_file_names, inputs=[in_data_files], outputs=[
|
457 |
|
458 |
tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
|
459 |
|
|
|
453 |
# TABULAR DATA REDACTION
|
454 |
###
|
455 |
in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
|
456 |
+
then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
|
457 |
|
458 |
tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
|
459 |
|
how_to_create_exe_dist.txt
CHANGED
@@ -12,9 +12,9 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
|
|
12 |
|
13 |
8. In command line, cd to the folder that contains app.py.
|
14 |
|
15 |
-
9.Run the following
|
16 |
|
17 |
-
a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio
|
18 |
|
19 |
# Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
|
20 |
|
@@ -28,11 +28,29 @@ a = Analysis(
|
|
28 |
}
|
29 |
)
|
30 |
|
31 |
-
|
32 |
|
|
|
33 |
|
34 |
-
9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
|
35 |
|
36 |
-
|
37 |
|
38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
8. In command line, cd to the folder that contains app.py.
|
14 |
|
15 |
+
9. Run the following (this helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
|
16 |
|
17 |
+
a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp_0.2.0 app.py
|
18 |
|
19 |
# Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
|
20 |
|
|
|
28 |
}
|
29 |
)
|
30 |
|
31 |
+
hook-presidio-image-redactor.py
|
32 |
|
33 |
+
c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.2.0.spec
|
34 |
|
|
|
35 |
|
36 |
+
9. A 'dist' folder will be created with the executable inside along with all dependencies ('dist\redaction').
|
37 |
|
38 |
+
10. Go to dist/APP-NAME/gradio/component_meta.py and modify the start of the 'create_or_modify_pyi(...' function to this:
|
39 |
+
|
40 |
+
def create_or_modify_pyi(
|
41 |
+
component_class: type, class_name: str, events: list[str | EventListener]
|
42 |
+
):
|
43 |
+
source_file = Path(inspect.getfile(component_class))
|
44 |
+
|
45 |
+
try:
|
46 |
+
# Try to read the source file
|
47 |
+
source_code = source_file.read_text(encoding="utf-8")
|
48 |
+
except FileNotFoundError:
|
49 |
+
# If source file not found, skip pyi generation
|
50 |
+
return None
|
51 |
+
|
52 |
+
11. Copy the poppler and tesseract folders into the location where the .exe is
|
53 |
+
|
54 |
+
12. In 'dist\redaction' try double clicking on the .exe file. After a short delay, the command prompt should inform you about the IP address of the app that is now running. Copy the IP address. **Do not close this window!**
|
55 |
+
|
56 |
+
13. In an Internet browser, navigate to the indicated IP address. The app should now be running in your browser window.
|
requirements.txt
CHANGED
@@ -12,14 +12,17 @@ scikit-learn==1.5.2
|
|
12 |
spacy==3.8.3
|
13 |
#en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
|
14 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
15 |
-
gradio==5.
|
16 |
-
boto3==1.
|
17 |
pyarrow==18.1.0
|
18 |
openpyxl==3.1.2
|
19 |
Faker==22.2.0
|
20 |
python-levenshtein==0.26.1
|
21 |
spaczz==0.6.1
|
22 |
gradio_image_annotation==0.2.5
|
|
|
|
|
|
|
23 |
numpy==1.26.4
|
24 |
awslambdaric==3.0.0
|
25 |
|
|
|
12 |
spacy==3.8.3
|
13 |
#en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
|
14 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
15 |
+
gradio==5.16.0
|
16 |
+
boto3==1.36.15
|
17 |
pyarrow==18.1.0
|
18 |
openpyxl==3.1.2
|
19 |
Faker==22.2.0
|
20 |
python-levenshtein==0.26.1
|
21 |
spaczz==0.6.1
|
22 |
gradio_image_annotation==0.2.5
|
23 |
+
# The following version includes rotation and image zoom options - not currently working so reverting to original until fixed
|
24 |
+
#git+https://github.com/seanpedrick-case/gradio_image_annotator
|
25 |
+
rapidfuzz==3.12.1
|
26 |
numpy==1.26.4
|
27 |
awslambdaric==3.0.0
|
28 |
|
tools/data_anonymise.py
CHANGED
@@ -389,6 +389,11 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
|
|
389 |
if isinstance(out_message, str):
|
390 |
out_message = [out_message]
|
391 |
|
|
|
|
|
|
|
|
|
|
|
392 |
if not out_file_paths:
|
393 |
out_file_paths = []
|
394 |
|
@@ -473,6 +478,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
|
|
473 |
sheet_name = ""
|
474 |
anon_df = read_file(anon_file)
|
475 |
out_file_part = get_file_name_without_type(anon_file.name)
|
|
|
476 |
out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
|
477 |
|
478 |
# Increase latest file completed count unless we are at the last file
|
|
|
389 |
if isinstance(out_message, str):
|
390 |
out_message = [out_message]
|
391 |
|
392 |
+
print("log_files_output_paths:",log_files_output_paths)
|
393 |
+
|
394 |
+
if isinstance(log_files_output_paths, str):
|
395 |
+
log_files_output_paths = []
|
396 |
+
|
397 |
if not out_file_paths:
|
398 |
out_file_paths = []
|
399 |
|
|
|
478 |
sheet_name = ""
|
479 |
anon_df = read_file(anon_file)
|
480 |
out_file_part = get_file_name_without_type(anon_file.name)
|
481 |
+
|
482 |
out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
|
483 |
|
484 |
# Increase latest file completed count unless we are at the last file
|
tools/file_conversion.py
CHANGED
@@ -304,44 +304,138 @@ def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:d
|
|
304 |
#shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
|
305 |
shape.commit()
|
306 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
307 |
def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
|
308 |
'''
|
309 |
Converts coordinates from pymupdf format to image coordinates,
|
310 |
-
accounting for mediabox dimensions.
|
311 |
'''
|
|
|
|
|
|
|
|
|
312 |
|
313 |
-
|
314 |
-
rect_width = pymupdf_page.rect.width
|
315 |
-
|
316 |
-
# Get mediabox dimensions
|
317 |
mediabox = pymupdf_page.mediabox
|
318 |
mediabox_width = mediabox.width
|
319 |
mediabox_height = mediabox.height
|
320 |
|
|
|
321 |
image_page_width, image_page_height = image.size
|
322 |
|
323 |
-
# Calculate scaling factors
|
324 |
-
|
325 |
-
|
|
|
|
|
|
|
326 |
|
327 |
-
#
|
328 |
-
#
|
|
|
329 |
|
330 |
-
|
331 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
332 |
|
333 |
-
|
334 |
-
|
335 |
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
341 |
|
342 |
return x1_image, y1_image, x2_image, y2_image
|
343 |
|
344 |
|
|
|
345 |
def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5):
|
346 |
# Small border to page that remains white
|
347 |
border = 5
|
@@ -598,13 +692,16 @@ def prepare_image_or_pdf(
|
|
598 |
all_annotations_object.append(annotation)
|
599 |
|
600 |
#print("annotation:", annotation, "for page:", str(i))
|
601 |
-
|
602 |
-
|
603 |
-
|
604 |
-
|
605 |
-
|
606 |
-
|
607 |
-
|
|
|
|
|
|
|
608 |
#print("Annotation page number:", annotation_page_number)
|
609 |
|
610 |
# Check if the annotation page number exists in the image file paths pages
|
@@ -744,7 +841,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
|
|
744 |
#print(number) # Output: 0
|
745 |
reported_number = int(number) + 1
|
746 |
else:
|
747 |
-
print("No number found before .png")
|
748 |
reported_number = 1
|
749 |
|
750 |
# Check if 'boxes' is in the annotation, if not, add an empty list
|
|
|
304 |
#shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
|
305 |
shape.commit()
|
306 |
|
307 |
+
# def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
|
308 |
+
# '''
|
309 |
+
# Converts coordinates from pymupdf format to image coordinates,
|
310 |
+
# accounting for mediabox dimensions and offset.
|
311 |
+
# '''
|
312 |
+
# # Get rect dimensions
|
313 |
+
# rect = pymupdf_page.rect
|
314 |
+
# rect_width = rect.width
|
315 |
+
# rect_height = rect.height
|
316 |
+
|
317 |
+
# # Get mediabox dimensions and position
|
318 |
+
# mediabox = pymupdf_page.mediabox
|
319 |
+
# mediabox_width = mediabox.width
|
320 |
+
# mediabox_height = mediabox.height
|
321 |
+
|
322 |
+
# # Get target image dimensions
|
323 |
+
# image_page_width, image_page_height = image.size
|
324 |
+
|
325 |
+
# # Calculate scaling factors
|
326 |
+
# image_to_mediabox_x_scale = image_page_width / mediabox_width
|
327 |
+
# image_to_mediabox_y_scale = image_page_height / mediabox_height
|
328 |
+
|
329 |
+
# image_to_rect_scale_width = image_page_width / rect_width
|
330 |
+
# image_to_rect_scale_height = image_page_height / rect_height
|
331 |
+
|
332 |
+
# # Adjust for offsets (difference in position between mediabox and rect)
|
333 |
+
# x_offset = rect.x0 - mediabox.x0 # Difference in x position
|
334 |
+
# y_offset = rect.y0 - mediabox.y0 # Difference in y position
|
335 |
+
|
336 |
+
# print("x_offset:", x_offset)
|
337 |
+
# print("y_offset:", y_offset)
|
338 |
+
|
339 |
+
# # Adjust coordinates:
|
340 |
+
# # Apply scaling to match image dimensions
|
341 |
+
# x1_image = x1 * image_to_mediabox_x_scale
|
342 |
+
# x2_image = x2 * image_to_mediabox_x_scale
|
343 |
+
# y1_image = y1 * image_to_mediabox_y_scale
|
344 |
+
# y2_image = y2 * image_to_mediabox_y_scale
|
345 |
+
|
346 |
+
# # Correct for difference in rect and mediabox size
|
347 |
+
# if mediabox_width != rect_width:
|
348 |
+
|
349 |
+
# mediabox_to_rect_x_scale = mediabox_width / rect_width
|
350 |
+
# mediabox_to_rect_y_scale = mediabox_height / rect_height
|
351 |
+
|
352 |
+
# x1_image *= mediabox_to_rect_x_scale
|
353 |
+
# x2_image *= mediabox_to_rect_x_scale
|
354 |
+
# y1_image *= mediabox_to_rect_y_scale
|
355 |
+
# y2_image *= mediabox_to_rect_y_scale
|
356 |
+
|
357 |
+
# print("mediabox_to_rect_x_scale:", mediabox_to_rect_x_scale)
|
358 |
+
# #print("mediabox_to_rect_y_scale:", mediabox_to_rect_y_scale)
|
359 |
+
|
360 |
+
# print("image_to_mediabox_x_scale:", image_to_mediabox_x_scale)
|
361 |
+
# #print("image_to_mediabox_y_scale:", image_to_mediabox_y_scale)
|
362 |
+
|
363 |
+
# mediabox_rect_x_diff = (mediabox_width - rect_width) * 2
|
364 |
+
# mediabox_rect_y_diff = (mediabox_height - rect_height) * 2
|
365 |
+
|
366 |
+
# x1_image -= mediabox_rect_x_diff
|
367 |
+
# x2_image -= mediabox_rect_x_diff
|
368 |
+
# y1_image += mediabox_rect_y_diff
|
369 |
+
# y2_image += mediabox_rect_y_diff
|
370 |
+
|
371 |
+
# return x1_image, y1_image, x2_image, y2_image
|
372 |
+
|
373 |
def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
|
374 |
'''
|
375 |
Converts coordinates from pymupdf format to image coordinates,
|
376 |
+
accounting for mediabox dimensions and offset.
|
377 |
'''
|
378 |
+
# Get rect dimensions
|
379 |
+
rect = pymupdf_page.rect
|
380 |
+
rect_width = rect.width
|
381 |
+
rect_height = rect.height
|
382 |
|
383 |
+
# Get mediabox dimensions and position
|
|
|
|
|
|
|
384 |
mediabox = pymupdf_page.mediabox
|
385 |
mediabox_width = mediabox.width
|
386 |
mediabox_height = mediabox.height
|
387 |
|
388 |
+
# Get target image dimensions
|
389 |
image_page_width, image_page_height = image.size
|
390 |
|
391 |
+
# Calculate scaling factors
|
392 |
+
image_to_mediabox_x_scale = image_page_width / mediabox_width
|
393 |
+
image_to_mediabox_y_scale = image_page_height / mediabox_height
|
394 |
+
|
395 |
+
image_to_rect_scale_width = image_page_width / rect_width
|
396 |
+
image_to_rect_scale_height = image_page_height / rect_height
|
397 |
|
398 |
+
# Adjust for offsets (difference in position between mediabox and rect)
|
399 |
+
x_offset = rect.x0 - mediabox.x0 # Difference in x position
|
400 |
+
y_offset = rect.y0 - mediabox.y0 # Difference in y position
|
401 |
|
402 |
+
#print("x_offset:", x_offset)
|
403 |
+
#print("y_offset:", y_offset)
|
404 |
+
|
405 |
+
# Adjust coordinates:
|
406 |
+
# Apply scaling to match image dimensions
|
407 |
+
x1_image = x1 * image_to_mediabox_x_scale
|
408 |
+
x2_image = x2 * image_to_mediabox_x_scale
|
409 |
+
y1_image = y1 * image_to_mediabox_y_scale
|
410 |
+
y2_image = y2 * image_to_mediabox_y_scale
|
411 |
+
|
412 |
+
# Correct for difference in rect and mediabox size
|
413 |
+
if mediabox_width != rect_width:
|
414 |
+
|
415 |
+
mediabox_to_rect_x_scale = mediabox_width / rect_width
|
416 |
+
mediabox_to_rect_y_scale = mediabox_height / rect_height
|
417 |
|
418 |
+
rect_to_mediabox_x_scale = rect_width / mediabox_width
|
419 |
+
#rect_to_mediabox_y_scale = rect_height / mediabox_height
|
420 |
|
421 |
+
mediabox_rect_x_diff = (mediabox_width - rect_width) * (image_to_mediabox_x_scale / 2)
|
422 |
+
mediabox_rect_y_diff = (mediabox_height - rect_height) * (image_to_mediabox_y_scale / 2)
|
423 |
+
|
424 |
+
x1_image -= mediabox_rect_x_diff
|
425 |
+
x2_image -= mediabox_rect_x_diff
|
426 |
+
y1_image += mediabox_rect_y_diff
|
427 |
+
y2_image += mediabox_rect_y_diff
|
428 |
+
|
429 |
+
#
|
430 |
+
x1_image *= mediabox_to_rect_x_scale
|
431 |
+
x2_image *= mediabox_to_rect_x_scale
|
432 |
+
y1_image *= mediabox_to_rect_y_scale
|
433 |
+
y2_image *= mediabox_to_rect_y_scale
|
434 |
|
435 |
return x1_image, y1_image, x2_image, y2_image
|
436 |
|
437 |
|
438 |
+
|
439 |
def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5):
|
440 |
# Small border to page that remains white
|
441 |
border = 5
|
|
|
692 |
all_annotations_object.append(annotation)
|
693 |
|
694 |
#print("annotation:", annotation, "for page:", str(i))
|
695 |
+
try:
|
696 |
+
if not annotation:
|
697 |
+
annotation = {"image":"", "boxes": []}
|
698 |
+
annotation_page_number = int(re.search(r'_(\d+)\.png$', image_file_path).group(1))
|
699 |
+
|
700 |
+
else:
|
701 |
+
annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
|
702 |
+
except Exception as e:
|
703 |
+
print("Extracting page number from image failed due to:", e)
|
704 |
+
annotation_page_number = 0
|
705 |
#print("Annotation page number:", annotation_page_number)
|
706 |
|
707 |
# Check if the annotation page number exists in the image file paths pages
|
|
|
841 |
#print(number) # Output: 0
|
842 |
reported_number = int(number) + 1
|
843 |
else:
|
844 |
+
print("No number found before .png. Returning page 1.")
|
845 |
reported_number = 1
|
846 |
|
847 |
# Check if 'boxes' is in the annotation, if not, add an empty list
|
tools/file_redaction.py
CHANGED
@@ -144,14 +144,21 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
144 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
145 |
|
146 |
if isinstance(custom_recogniser_word_list, pd.DataFrame):
|
147 |
-
|
|
|
|
|
|
|
|
|
148 |
|
149 |
# Sort the strings in order from the longest string to the shortest
|
150 |
custom_recogniser_word_list = sorted(custom_recogniser_word_list, key=len, reverse=True)
|
151 |
|
152 |
if isinstance(redact_whole_page_list, pd.DataFrame):
|
153 |
-
|
154 |
-
|
|
|
|
|
|
|
155 |
|
156 |
# If this is the first time around, set variables to 0/blank
|
157 |
if first_loop_state==True:
|
@@ -1209,7 +1216,7 @@ def redact_image_pdf(file_path:str,
|
|
1209 |
|
1210 |
## Apply annotations with pymupdf
|
1211 |
else:
|
1212 |
-
print("merged_redaction_boxes:", merged_redaction_bboxes)
|
1213 |
#print("redact_whole_page_list:", redact_whole_page_list)
|
1214 |
if redact_whole_page_list:
|
1215 |
int_reported_page_number = int(reported_page_number)
|
|
|
144 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
145 |
|
146 |
if isinstance(custom_recogniser_word_list, pd.DataFrame):
|
147 |
+
if not custom_recogniser_word_list.empty:
|
148 |
+
custom_recogniser_word_list = custom_recogniser_word_list.iloc[:, 0].tolist()
|
149 |
+
else:
|
150 |
+
# Handle the case where the DataFrame is empty
|
151 |
+
custom_recogniser_word_list = [] # or some default value
|
152 |
|
153 |
# Sort the strings in order from the longest string to the shortest
|
154 |
custom_recogniser_word_list = sorted(custom_recogniser_word_list, key=len, reverse=True)
|
155 |
|
156 |
if isinstance(redact_whole_page_list, pd.DataFrame):
|
157 |
+
if not redact_whole_page_list.empty:
|
158 |
+
redact_whole_page_list = redact_whole_page_list.iloc[:,0].tolist()
|
159 |
+
else:
|
160 |
+
# Handle the case where the DataFrame is empty
|
161 |
+
redact_whole_page_list = [] # or some default value
|
162 |
|
163 |
# If this is the first time around, set variables to 0/blank
|
164 |
if first_loop_state==True:
|
|
|
1216 |
|
1217 |
## Apply annotations with pymupdf
|
1218 |
else:
|
1219 |
+
#print("merged_redaction_boxes:", merged_redaction_bboxes)
|
1220 |
#print("redact_whole_page_list:", redact_whole_page_list)
|
1221 |
if redact_whole_page_list:
|
1222 |
int_reported_page_number = int(reported_page_number)
|
tools/load_spacy_model_custom_recognisers.py
CHANGED
@@ -184,9 +184,9 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
|
|
184 |
#print("custom_query_list:", custom_query_list)
|
185 |
|
186 |
if not text:
|
187 |
-
out_message = "
|
188 |
print(out_message)
|
189 |
-
return
|
190 |
|
191 |
for string_query in custom_query_list:
|
192 |
|
@@ -254,14 +254,14 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
|
|
254 |
for match_id, start, end, ratio, pattern in matches:
|
255 |
span = str(doc[start:end]).strip()
|
256 |
query_search = str(query).strip()
|
257 |
-
print("doc:", doc)
|
258 |
-
print("span:", span)
|
259 |
-
print("query_search:", query_search)
|
260 |
|
261 |
# Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
|
262 |
distance = Levenshtein.distance(query_search.lower(), span.lower())
|
263 |
|
264 |
-
print("Levenshtein distance:", distance)
|
265 |
|
266 |
if distance > spelling_mistakes_max:
|
267 |
match_count = match_count - 1
|
@@ -270,8 +270,8 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
|
|
270 |
start_char = doc[start].idx # Start character position
|
271 |
end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
|
272 |
|
273 |
-
print("start_char:", start_char)
|
274 |
-
print("end_char:", end_char)
|
275 |
|
276 |
all_matches.append(match_count)
|
277 |
all_start_positions.append(start_char)
|
|
|
184 |
#print("custom_query_list:", custom_query_list)
|
185 |
|
186 |
if not text:
|
187 |
+
out_message = "No text data found. Skipping page."
|
188 |
print(out_message)
|
189 |
+
return all_start_positions, all_end_positions
|
190 |
|
191 |
for string_query in custom_query_list:
|
192 |
|
|
|
254 |
for match_id, start, end, ratio, pattern in matches:
|
255 |
span = str(doc[start:end]).strip()
|
256 |
query_search = str(query).strip()
|
257 |
+
#print("doc:", doc)
|
258 |
+
#print("span:", span)
|
259 |
+
#print("query_search:", query_search)
|
260 |
|
261 |
# Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
|
262 |
distance = Levenshtein.distance(query_search.lower(), span.lower())
|
263 |
|
264 |
+
#print("Levenshtein distance:", distance)
|
265 |
|
266 |
if distance > spelling_mistakes_max:
|
267 |
match_count = match_count - 1
|
|
|
270 |
start_char = doc[start].idx # Start character position
|
271 |
end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
|
272 |
|
273 |
+
#print("start_char:", start_char)
|
274 |
+
#print("end_char:", end_char)
|
275 |
|
276 |
all_matches.append(match_count)
|
277 |
all_start_positions.append(start_char)
|
tools/redaction_review.py
CHANGED
@@ -137,7 +137,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
|
|
137 |
page_num_reported = 1
|
138 |
|
139 |
out_image_annotator = image_annotator(
|
140 |
-
|
141 |
boxes_alpha=0.1,
|
142 |
box_thickness=1,
|
143 |
label_list=recogniser_entities_list,
|
@@ -295,9 +295,14 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
|
|
295 |
fill = img_annotation_box["color"]
|
296 |
|
297 |
draw.rectangle(coords, fill=fill)
|
298 |
-
|
|
|
299 |
image.save(output_folder + file_name_without_ext + "_redacted.png")
|
300 |
|
|
|
|
|
|
|
|
|
301 |
doc = [image]
|
302 |
|
303 |
elif file_extension in '.csv':
|
@@ -347,7 +352,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
|
|
347 |
output_files.append(out_pdf_file_path)
|
348 |
|
349 |
else:
|
350 |
-
print("PDF input not found.")
|
351 |
|
352 |
# If save_pdf is not true, then add the original pdf to the output files
|
353 |
else:
|
@@ -500,8 +505,8 @@ def create_xfdf(df, pdf_path, pymupdf_doc, image_paths):
|
|
500 |
redact_annot.set('interior-color', colour_str)
|
501 |
#redact_annot.set('fill-color', colour_str)
|
502 |
#redact_annot.set('outline-color', colour_str)
|
503 |
-
redact_annot.set('overlay-color', colour_str)
|
504 |
-
redact_annot.set('overlay-text', row['label'])
|
505 |
redact_annot.set('opacity', "0.5")
|
506 |
|
507 |
# Add appearance dictionary
|
|
|
137 |
page_num_reported = 1
|
138 |
|
139 |
out_image_annotator = image_annotator(
|
140 |
+
None,
|
141 |
boxes_alpha=0.1,
|
142 |
box_thickness=1,
|
143 |
label_list=recogniser_entities_list,
|
|
|
295 |
fill = img_annotation_box["color"]
|
296 |
|
297 |
draw.rectangle(coords, fill=fill)
|
298 |
+
|
299 |
+
output_image_path = output_folder + file_name_without_ext + "_redacted.png"
|
300 |
image.save(output_folder + file_name_without_ext + "_redacted.png")
|
301 |
|
302 |
+
output_files.append(output_image_path)
|
303 |
+
|
304 |
+
print("Redactions saved to image file")
|
305 |
+
|
306 |
doc = [image]
|
307 |
|
308 |
elif file_extension in '.csv':
|
|
|
352 |
output_files.append(out_pdf_file_path)
|
353 |
|
354 |
else:
|
355 |
+
print("PDF input not found. Outputs not saved to PDF.")
|
356 |
|
357 |
# If save_pdf is not true, then add the original pdf to the output files
|
358 |
else:
|
|
|
505 |
redact_annot.set('interior-color', colour_str)
|
506 |
#redact_annot.set('fill-color', colour_str)
|
507 |
#redact_annot.set('outline-color', colour_str)
|
508 |
+
#redact_annot.set('overlay-color', colour_str)
|
509 |
+
#redact_annot.set('overlay-text', row['label'])
|
510 |
redact_annot.set('opacity', "0.5")
|
511 |
|
512 |
# Add appearance dictionary
|