Sean Pedrick-Case committed on
Commit 944dfca · unverified
2 Parent(s): 713ca11 7917a26

Merge pull request #8 from seanpedrick-case/dev


Fixed csv/xlsx redaction.
Updated the guide on creating an .exe.
Corrected image coordinate translation when the PDF mediabox is not the same size as the PDF page rectangle.
Fixed issues with Gradio version 5.16.
Fixed a fuzzy search error on pages with no data.
Added git to the Dockerfile so that git-based custom Gradio components can be installed.

DocRedactApp_0.2.spec → DocRedactApp_0.2.0.spec RENAMED
@@ -1,17 +1,31 @@
 # -*- mode: python ; coding: utf-8 -*-
 from PyInstaller.utils.hooks import collect_data_files
+from PyInstaller.utils.hooks import collect_all

 datas = [('tesseract/', 'tesseract/'), ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/')]
+binaries = []
+hiddenimports = ['gradio_image_annotation', 'pyarrow.vendored.version', 'pydicom.encoders', 'safehttpx', 'presidio_analyzer', 'presidio_anonymizer', 'presidio_image_redactor']
 datas += collect_data_files('gradio_client')
 datas += collect_data_files('gradio')
+datas += collect_data_files('gradio_image_annotation')
+tmp_ret = collect_all('gradio_image_annotation')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+tmp_ret = collect_all('safehttpx')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+tmp_ret = collect_all('presidio_analyzer')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+tmp_ret = collect_all('presidio_anonymizer')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+tmp_ret = collect_all('presidio_image_redactor')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]


 a = Analysis(
     ['app.py'],
     pathex=[],
-    binaries=[],
+    binaries=binaries,
     datas=datas,
-    hiddenimports=['pyarrow.vendored.version', 'pydicom.encoders'],
+    hiddenimports=hiddenimports,
     hookspath=['build_deps'],
     hooksconfig={},
     runtime_hooks=[],
@@ -29,7 +43,7 @@ exe = EXE(
     a.scripts,
     [],
     exclude_binaries=True,
-    name='DocRedactApp_0.2',
+    name='DocRedactApp_0.2.0',
     debug=False,
     bootloader_ignore_signals=False,
     strip=False,
@@ -48,5 +62,5 @@ coll = COLLECT(
     strip=False,
     upx=True,
     upx_exclude=[],
-    name='DocRedactApp_0.2',
+    name='DocRedactApp_0.2.0',
 )
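
Note: PyInstaller's collect_all() returns a (datas, binaries, hiddenimports) tuple for a package, which is why each tmp_ret is unpacked into the three lists above. A minimal sketch of the same logic written as a loop (illustrative only, not part of the committed spec file):

from PyInstaller.utils.hooks import collect_all

datas, binaries, hiddenimports = [], [], []
# Same packages the spec bundles explicitly; collect_all pulls in each
# package's data files, binary dependencies and submodules.
for package in ['gradio_image_annotation', 'safehttpx', 'presidio_analyzer',
                'presidio_anonymizer', 'presidio_image_redactor']:
    pkg_datas, pkg_binaries, pkg_hidden = collect_all(package)
    datas += pkg_datas
    binaries += pkg_binaries
    hiddenimports += pkg_hidden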
Dockerfile CHANGED
@@ -8,7 +8,8 @@ RUN apt-get update \
     make \
     cmake \
     unzip \
-    libcurl4-openssl-dev \
+    libcurl4-openssl-dev \
+    git \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
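
git is added because pip needs the git executable to resolve Git-based requirements, such as the custom Gradio component referenced (commented out) in requirements.txt below. As an illustration only, not part of this diff, installing such a component inside the image would look like:

RUN pip install git+https://github.com/seanpedrick-case/gradio_image_annotator

Without git in the image, that install step fails because pip cannot clone the repository.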
 
app.py CHANGED
@@ -453,7 +453,7 @@ with app:
     # TABULAR DATA REDACTION
     ###
     in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
-        then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_full_file_name_textbox, data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
+        then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])

     tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
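
The change above removes the duplicated data_full_file_name_textbox from the .then() outputs list so that each component appears only once. A minimal sketch of the upload/.then() chaining pattern used here, with placeholder component and function names rather than the app's real ones:

import gradio as gr

def update_columns(files):
    # Placeholder: would inspect the uploaded files and return column choices
    return gr.update(choices=["col_a", "col_b"])

def get_names(files):
    # Placeholder: would return display names for the uploaded files
    return ", ".join(str(f) for f in files)

with gr.Blocks() as demo:
    in_files = gr.File(file_count="multiple")
    in_colnames = gr.Dropdown(label="Columns")
    file_names = gr.Textbox(label="File names")

    # Each output component is listed once per event; .then() chains a second
    # callback that runs after the upload handler finishes.
    in_files.upload(fn=update_columns, inputs=[in_files], outputs=[in_colnames]).\
        then(fn=get_names, inputs=[in_files], outputs=[file_names])

demo.launch()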
 
how_to_create_exe_dist.txt CHANGED
@@ -12,9 +12,9 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re

8. In command line, cd to the folder that contains app.py.

- 9.Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
+ 9. Run the following (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):

- a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --name DocRedactApp_0.2 app.py
+ a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp_0.2.0 app.py

# Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.

@@ -28,11 +28,29 @@ a = Analysis(
    }
)

+ hook-presidio-image-redactor.py
+
- c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.2.spec
+ c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.2.0.spec


- 9. A 'dist' folder will be created with the executable inside along with all dependencies ('dist\data_text_search').
+ 9. A 'dist' folder will be created with the executable inside along with all dependencies ('dist\redaction').

- 10. In 'dist\data_text_search' try double clicking on the .exe file. After a short delay, the command prompt should inform you about the IP address of the app that is now running. Copy the IP address. **Do not close this window!**
+ 10. Go to dist/APP-NAME/gradio/component_meta.py and modify the start of the 'create_or_modify_pyi(...' function to this:
+
+ def create_or_modify_pyi(
+     component_class: type, class_name: str, events: list[str | EventListener]
+ ):
+     source_file = Path(inspect.getfile(component_class))
+
+     try:
+         # Try to read the source file
+         source_code = source_file.read_text(encoding="utf-8")
+     except FileNotFoundError:
+         # If source file not found, skip pyi generation
+         return None
+
- 11. In an Internet browser, navigate to the indicated IP address. The app should now be running in your browser window.
+ 11. Copy the poppler and tesseract folders into the location where the .exe is
+
+ 12. In 'dist\redaction' try double clicking on the .exe file. After a short delay, the command prompt should inform you about the IP address of the app that is now running. Copy the IP address. **Do not close this window!**
+
+ 13. In an Internet browser, navigate to the indicated IP address. The app should now be running in your browser window.
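
The guide refers to a custom hook for presidio_image_redactor in the build_deps folder but does not show its contents. A minimal sketch of what such a PyInstaller hook could contain (hypothetical, not the repository's actual hook; hook files are named hook-<module_name>.py so PyInstaller picks them up from --additional-hooks-dir):

# build_deps/hook-presidio_image_redactor.py (hypothetical contents)
from PyInstaller.utils.hooks import collect_data_files, collect_submodules

# Bundle the package's non-Python data files and make sure all of its
# submodules are importable from the frozen executable.
datas = collect_data_files('presidio_image_redactor')
hiddenimports = collect_submodules('presidio_image_redactor')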
requirements.txt CHANGED
@@ -12,14 +12,17 @@ scikit-learn==1.5.2
 spacy==3.8.3
 #en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
-gradio==5.12.0
-boto3==1.35.83
+gradio==5.16.0
+boto3==1.36.15
 pyarrow==18.1.0
 openpyxl==3.1.2
 Faker==22.2.0
 python-levenshtein==0.26.1
 spaczz==0.6.1
 gradio_image_annotation==0.2.5
+# The following version includes rotation and image zoom options - not currently working so reverting to original until fixed
+#git+https://github.com/seanpedrick-case/gradio_image_annotator
+rapidfuzz==3.12.1
 numpy==1.26.4
 awslambdaric==3.0.0
 
tools/data_anonymise.py CHANGED
@@ -389,6 +389,11 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
    if isinstance(out_message, str):
        out_message = [out_message]

+   print("log_files_output_paths:", log_files_output_paths)
+
+   if isinstance(log_files_output_paths, str):
+       log_files_output_paths = []
+
    if not out_file_paths:
        out_file_paths = []

@@ -473,6 +478,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
        sheet_name = ""
        anon_df = read_file(anon_file)
        out_file_part = get_file_name_without_type(anon_file.name)
+
        out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)

        # Increase latest file completed count unless we are at the last file
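
The new isinstance check guards against log_files_output_paths arriving as a plain string (for example from a Gradio textbox or state component) rather than the list the rest of the function appends to; a string is simply reset to an empty list. A small illustration with hypothetical values:

log_files_output_paths = ""   # Gradio can hand the state back as a string

if isinstance(log_files_output_paths, str):
    log_files_output_paths = []   # reset so list operations below are safe

log_files_output_paths.append("output/anonymised_log.csv")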
tools/file_conversion.py CHANGED
@@ -304,44 +304,138 @@ def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:d
    #shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
    shape.commit()

+ # def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
+ #     '''
+ #     Converts coordinates from pymupdf format to image coordinates,
+ #     accounting for mediabox dimensions and offset.
+ #     '''
+ #     # Get rect dimensions
+ #     rect = pymupdf_page.rect
+ #     rect_width = rect.width
+ #     rect_height = rect.height
+ #
+ #     # Get mediabox dimensions and position
+ #     mediabox = pymupdf_page.mediabox
+ #     mediabox_width = mediabox.width
+ #     mediabox_height = mediabox.height
+ #
+ #     # Get target image dimensions
+ #     image_page_width, image_page_height = image.size
+ #
+ #     # Calculate scaling factors
+ #     image_to_mediabox_x_scale = image_page_width / mediabox_width
+ #     image_to_mediabox_y_scale = image_page_height / mediabox_height
+ #
+ #     image_to_rect_scale_width = image_page_width / rect_width
+ #     image_to_rect_scale_height = image_page_height / rect_height
+ #
+ #     # Adjust for offsets (difference in position between mediabox and rect)
+ #     x_offset = rect.x0 - mediabox.x0 # Difference in x position
+ #     y_offset = rect.y0 - mediabox.y0 # Difference in y position
+ #
+ #     print("x_offset:", x_offset)
+ #     print("y_offset:", y_offset)
+ #
+ #     # Adjust coordinates:
+ #     # Apply scaling to match image dimensions
+ #     x1_image = x1 * image_to_mediabox_x_scale
+ #     x2_image = x2 * image_to_mediabox_x_scale
+ #     y1_image = y1 * image_to_mediabox_y_scale
+ #     y2_image = y2 * image_to_mediabox_y_scale
+ #
+ #     # Correct for difference in rect and mediabox size
+ #     if mediabox_width != rect_width:
+ #
+ #         mediabox_to_rect_x_scale = mediabox_width / rect_width
+ #         mediabox_to_rect_y_scale = mediabox_height / rect_height
+ #
+ #         x1_image *= mediabox_to_rect_x_scale
+ #         x2_image *= mediabox_to_rect_x_scale
+ #         y1_image *= mediabox_to_rect_y_scale
+ #         y2_image *= mediabox_to_rect_y_scale
+ #
+ #         print("mediabox_to_rect_x_scale:", mediabox_to_rect_x_scale)
+ #         #print("mediabox_to_rect_y_scale:", mediabox_to_rect_y_scale)
+ #
+ #         print("image_to_mediabox_x_scale:", image_to_mediabox_x_scale)
+ #         #print("image_to_mediabox_y_scale:", image_to_mediabox_y_scale)
+ #
+ #         mediabox_rect_x_diff = (mediabox_width - rect_width) * 2
+ #         mediabox_rect_y_diff = (mediabox_height - rect_height) * 2
+ #
+ #         x1_image -= mediabox_rect_x_diff
+ #         x2_image -= mediabox_rect_x_diff
+ #         y1_image += mediabox_rect_y_diff
+ #         y2_image += mediabox_rect_y_diff
+ #
+ #     return x1_image, y1_image, x2_image, y2_image
+
 def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
    '''
    Converts coordinates from pymupdf format to image coordinates,
-   accounting for mediabox dimensions.
+   accounting for mediabox dimensions and offset.
    '''
+   # Get rect dimensions
+   rect = pymupdf_page.rect
+   rect_width = rect.width
+   rect_height = rect.height

-   rect_height = pymupdf_page.rect.height
-   rect_width = pymupdf_page.rect.width
-
-   # Get mediabox dimensions
+   # Get mediabox dimensions and position
    mediabox = pymupdf_page.mediabox
    mediabox_width = mediabox.width
    mediabox_height = mediabox.height

+   # Get target image dimensions
    image_page_width, image_page_height = image.size

-   # Calculate scaling factors using mediabox dimensions
-   scale_width = image_page_width / mediabox_width
-   scale_height = image_page_height / mediabox_height
-
-   #print("scale_width:", scale_width)
-   #print("scale_height:", scale_height)
-
-   rect_to_mediabox_x_scale = mediabox_width / rect_width
-   rect_to_mediabox_y_scale = mediabox_height / rect_height
-
-   #print("rect_to_mediabox_x_scale:", rect_to_mediabox_x_scale)
-   #print("rect_to_mediabox_y_scale:", rect_to_mediabox_y_scale)
-
-   # Adjust coordinates based on scaling factors
-   x1_image = (x1 * scale_width) * rect_to_mediabox_x_scale
-   y1_image = (y1 * scale_height) * rect_to_mediabox_y_scale
-   x2_image = (x2 * scale_width) * rect_to_mediabox_x_scale
-   y2_image = (y2 * scale_height) * rect_to_mediabox_y_scale
+   # Calculate scaling factors
+   image_to_mediabox_x_scale = image_page_width / mediabox_width
+   image_to_mediabox_y_scale = image_page_height / mediabox_height
+
+   image_to_rect_scale_width = image_page_width / rect_width
+   image_to_rect_scale_height = image_page_height / rect_height
+
+   # Adjust for offsets (difference in position between mediabox and rect)
+   x_offset = rect.x0 - mediabox.x0 # Difference in x position
+   y_offset = rect.y0 - mediabox.y0 # Difference in y position
+
+   #print("x_offset:", x_offset)
+   #print("y_offset:", y_offset)
+
+   # Adjust coordinates:
+   # Apply scaling to match image dimensions
+   x1_image = x1 * image_to_mediabox_x_scale
+   x2_image = x2 * image_to_mediabox_x_scale
+   y1_image = y1 * image_to_mediabox_y_scale
+   y2_image = y2 * image_to_mediabox_y_scale
+
+   # Correct for difference in rect and mediabox size
+   if mediabox_width != rect_width:
+
+       mediabox_to_rect_x_scale = mediabox_width / rect_width
+       mediabox_to_rect_y_scale = mediabox_height / rect_height
+
+       rect_to_mediabox_x_scale = rect_width / mediabox_width
+       #rect_to_mediabox_y_scale = rect_height / mediabox_height
+
+       mediabox_rect_x_diff = (mediabox_width - rect_width) * (image_to_mediabox_x_scale / 2)
+       mediabox_rect_y_diff = (mediabox_height - rect_height) * (image_to_mediabox_y_scale / 2)
+
+       x1_image -= mediabox_rect_x_diff
+       x2_image -= mediabox_rect_x_diff
+       y1_image += mediabox_rect_y_diff
+       y2_image += mediabox_rect_y_diff
+
+       x1_image *= mediabox_to_rect_x_scale
+       x2_image *= mediabox_to_rect_x_scale
+       y1_image *= mediabox_to_rect_y_scale
+       y2_image *= mediabox_to_rect_y_scale

    return x1_image, y1_image, x2_image, y2_image


+
 def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5):
    # Small border to page that remains white
    border = 5
@@ -598,13 +692,16 @@ def prepare_image_or_pdf(
    all_annotations_object.append(annotation)

    #print("annotation:", annotation, "for page:", str(i))
-
-   if not annotation:
-       annotation = {"image":"", "boxes": []}
-       annotation_page_number = int(re.search(r'_(\d+)\.png$', image_file_path).group(1))
-
-   else:
-       annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
+   try:
+       if not annotation:
+           annotation = {"image":"", "boxes": []}
+           annotation_page_number = int(re.search(r'_(\d+)\.png$', image_file_path).group(1))
+
+       else:
+           annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
+   except Exception as e:
+       print("Extracting page number from image failed due to:", e)
+       annotation_page_number = 0
    #print("Annotation page number:", annotation_page_number)

    # Check if the annotation page number exists in the image file paths pages
@@ -744,7 +841,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
        #print(number) # Output: 0
        reported_number = int(number) + 1
    else:
-       print("No number found before .png")
+       print("No number found before .png. Returning page 1.")
        reported_number = 1

    # Check if 'boxes' is in the annotation, if not, add an empty list
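
As a usage sketch of the corrected conversion (illustrative only: the file name and DPI are made up, and the import path assumes the repository layout), a PyMuPDF-space box can be mapped onto a rendered page image like this:

import io
import fitz  # PyMuPDF
from PIL import Image
from tools.file_conversion import convert_pymupdf_to_image_coords

doc = fitz.open("example.pdf")   # hypothetical input
page = doc[0]

# Render the page to a PIL image, as the redaction pipeline does.
pix = page.get_pixmap(dpi=150)
image = Image.open(io.BytesIO(pix.tobytes("png")))

# page.rect is the visible page rectangle; page.mediabox can be larger,
# which is the case the corrected function now handles.
print("rect:", page.rect, "mediabox:", page.mediabox)

# Map a redaction box given in PyMuPDF coordinates into image pixels.
x1_img, y1_img, x2_img, y2_img = convert_pymupdf_to_image_coords(page, 100, 200, 300, 250, image)
print(x1_img, y1_img, x2_img, y2_img)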
tools/file_redaction.py CHANGED
@@ -144,14 +144,21 @@ def choose_and_run_redactor(file_paths:List[str],
    review_out_file_paths = [prepared_pdf_file_paths[0]]

    if isinstance(custom_recogniser_word_list, pd.DataFrame):
-       custom_recogniser_word_list = custom_recogniser_word_list.iloc[:,0].tolist()
+       if not custom_recogniser_word_list.empty:
+           custom_recogniser_word_list = custom_recogniser_word_list.iloc[:, 0].tolist()
+       else:
+           # Handle the case where the DataFrame is empty
+           custom_recogniser_word_list = [] # or some default value

        # Sort the strings in order from the longest string to the shortest
        custom_recogniser_word_list = sorted(custom_recogniser_word_list, key=len, reverse=True)

    if isinstance(redact_whole_page_list, pd.DataFrame):
-       redact_whole_page_list = redact_whole_page_list.iloc[:,0].tolist()
-
+       if not redact_whole_page_list.empty:
+           redact_whole_page_list = redact_whole_page_list.iloc[:,0].tolist()
+       else:
+           # Handle the case where the DataFrame is empty
+           redact_whole_page_list = [] # or some default value

    # If this is the first time around, set variables to 0/blank
    if first_loop_state==True:
@@ -1209,7 +1216,7 @@ def redact_image_pdf(file_path:str,

    ## Apply annotations with pymupdf
    else:
-       print("merged_redaction_boxes:", merged_redaction_bboxes)
+       #print("merged_redaction_boxes:", merged_redaction_bboxes)
        #print("redact_whole_page_list:", redact_whole_page_list)
        if redact_whole_page_list:
            int_reported_page_number = int(reported_page_number)
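
The new guards matter because these inputs appear to arrive as DataFrames (for example from uploaded CSVs), and calling .iloc[:, 0] on a DataFrame with no columns raises an IndexError rather than returning an empty list. A small illustration with hypothetical data:

import pandas as pd

df = pd.DataFrame()   # e.g. an empty upload
# [] instead of an IndexError from df.iloc[:, 0]
words = df.iloc[:, 0].tolist() if not df.empty else []

df2 = pd.DataFrame({"word": ["name", "address"]})
words2 = df2.iloc[:, 0].tolist() if not df2.empty else []   # ['name', 'address']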
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -184,9 +184,9 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
    #print("custom_query_list:", custom_query_list)

    if not text:
-       out_message = "Prepared data not found. Have you clicked 'Load data' above to prepare a search index?"
+       out_message = "No text data found. Skipping page."
        print(out_message)
-       return out_message, None
+       return all_start_positions, all_end_positions

    for string_query in custom_query_list:

@@ -254,14 +254,14 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
    for match_id, start, end, ratio, pattern in matches:
        span = str(doc[start:end]).strip()
        query_search = str(query).strip()
-       print("doc:", doc)
-       print("span:", span)
-       print("query_search:", query_search)
+       #print("doc:", doc)
+       #print("span:", span)
+       #print("query_search:", query_search)

        # Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
        distance = Levenshtein.distance(query_search.lower(), span.lower())

-       print("Levenshtein distance:", distance)
+       #print("Levenshtein distance:", distance)

        if distance > spelling_mistakes_max:
            match_count = match_count - 1
@@ -270,8 +270,8 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
        start_char = doc[start].idx # Start character position
        end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position

-       print("start_char:", start_char)
-       print("end_char:", end_char)
+       #print("start_char:", start_char)
+       #print("end_char:", end_char)

        all_matches.append(match_count)
        all_start_positions.append(start_char)
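
The first hunk makes spacy_fuzzy_search return its (still empty) start/end position lists when a page has no text, instead of an (error message, None) pair that callers do not expect. A simplified sketch of that guard together with the Levenshtein filter shown below it (a toy character-window matcher stands in for the real spaCy/spaczz matching):

import Levenshtein

def fuzzy_positions(text: str, queries: list, spelling_mistakes_max: int = 1):
    all_start_positions, all_end_positions = [], []

    if not text:
        # No text on the page: return the empty lists the caller expects.
        print("No text data found. Skipping page.")
        return all_start_positions, all_end_positions

    for query in queries:
        window = len(query)
        for start in range(len(text) - window + 1):
            span = text[start:start + window]
            # Keep a candidate only if it is within the allowed number of
            # spelling mistakes of the query.
            if Levenshtein.distance(query.lower(), span.lower()) <= spelling_mistakes_max:
                all_start_positions.append(start)
                all_end_positions.append(start + window)

    return all_start_positions, all_end_positions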
tools/redaction_review.py CHANGED
@@ -137,7 +137,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
        page_num_reported = 1

    out_image_annotator = image_annotator(
-       image_annotator_object[page_num_reported - 1],
+       None,
        boxes_alpha=0.1,
        box_thickness=1,
        label_list=recogniser_entities_list,
@@ -295,9 +295,14 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
            fill = img_annotation_box["color"]

            draw.rectangle(coords, fill=fill)
-
+
+       output_image_path = output_folder + file_name_without_ext + "_redacted.png"
        image.save(output_folder + file_name_without_ext + "_redacted.png")

+       output_files.append(output_image_path)
+
+       print("Redactions saved to image file")
+
        doc = [image]

    elif file_extension in '.csv':
@@ -347,7 +352,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
                output_files.append(out_pdf_file_path)

            else:
-               print("PDF input not found.")
+               print("PDF input not found. Outputs not saved to PDF.")

        # If save_pdf is not true, then add the original pdf to the output files
        else:
@@ -500,8 +505,8 @@ def create_xfdf(df, pdf_path, pymupdf_doc, image_paths):
        redact_annot.set('interior-color', colour_str)
        #redact_annot.set('fill-color', colour_str)
        #redact_annot.set('outline-color', colour_str)
-       redact_annot.set('overlay-color', colour_str)
-       redact_annot.set('overlay-text', row['label'])
+       #redact_annot.set('overlay-color', colour_str)
+       #redact_annot.set('overlay-text', row['label'])
        redact_annot.set('opacity', "0.5")

        # Add appearance dictionary