Merge pull request #8 from seanpedrick-case/dev
Browse files. Fixed csv/xlsx redaction.
Updated guide on creating exe.
Corrected image coordinate translation when the pdf mediabox is not the same size as pdf page rectangle
Fixed issues with gradio version 5.16.
Fixed fuzzy search error with pages with no data.
Added git to Dockerfile to be able to install git-based custom gradio components
- DocRedactApp_0.2.spec → DocRedactApp_0.2.0.spec +18 -4
- Dockerfile +2 -1
- app.py +1 -1
- how_to_create_exe_dist.txt +24 -6
- requirements.txt +5 -2
- tools/data_anonymise.py +6 -0
- tools/file_conversion.py +124 -27
- tools/file_redaction.py +11 -4
- tools/load_spacy_model_custom_recognisers.py +8 -8
- tools/redaction_review.py +10 -5
DocRedactApp_0.2.spec → DocRedactApp_0.2.0.spec
RENAMED
@@ -1,17 +1,31 @@
|
|
1 |
# -*- mode: python ; coding: utf-8 -*-
|
2 |
from PyInstaller.utils.hooks import collect_data_files
|
|
|
3 |
|
4 |
datas = [('tesseract/', 'tesseract/'), ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/')]
|
|
|
|
|
5 |
datas += collect_data_files('gradio_client')
|
6 |
datas += collect_data_files('gradio')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
|
9 |
a = Analysis(
|
10 |
['app.py'],
|
11 |
pathex=[],
|
12 |
-
binaries=
|
13 |
datas=datas,
|
14 |
-
hiddenimports=
|
15 |
hookspath=['build_deps'],
|
16 |
hooksconfig={},
|
17 |
runtime_hooks=[],
|
@@ -29,7 +43,7 @@ exe = EXE(
|
|
29 |
a.scripts,
|
30 |
[],
|
31 |
exclude_binaries=True,
|
32 |
-
name='DocRedactApp_0.2',
|
33 |
debug=False,
|
34 |
bootloader_ignore_signals=False,
|
35 |
strip=False,
|
@@ -48,5 +62,5 @@ coll = COLLECT(
|
|
48 |
strip=False,
|
49 |
upx=True,
|
50 |
upx_exclude=[],
|
51 |
-
name='DocRedactApp_0.2',
|
52 |
)
|
|
|
1 |
# -*- mode: python ; coding: utf-8 -*-
|
2 |
from PyInstaller.utils.hooks import collect_data_files
|
3 |
+
from PyInstaller.utils.hooks import collect_all
|
4 |
|
5 |
datas = [('tesseract/', 'tesseract/'), ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/')]
|
6 |
+
binaries = []
|
7 |
+
hiddenimports = ['gradio_image_annotation', 'pyarrow.vendored.version', 'pydicom.encoders', 'safehttpx', 'presidio_analyzer', 'presidio_anonymizer', 'presidio_image_redactor']
|
8 |
datas += collect_data_files('gradio_client')
|
9 |
datas += collect_data_files('gradio')
|
10 |
+
datas += collect_data_files('gradio_image_annotation')
|
11 |
+
tmp_ret = collect_all('gradio_image_annotation')
|
12 |
+
datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
|
13 |
+
tmp_ret = collect_all('safehttpx')
|
14 |
+
datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
|
15 |
+
tmp_ret = collect_all('presidio_analyzer')
|
16 |
+
datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
|
17 |
+
tmp_ret = collect_all('presidio_anonymizer')
|
18 |
+
datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
|
19 |
+
tmp_ret = collect_all('presidio_image_redactor')
|
20 |
+
datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
|
21 |
|
22 |
|
23 |
a = Analysis(
|
24 |
['app.py'],
|
25 |
pathex=[],
|
26 |
+
binaries=binaries,
|
27 |
datas=datas,
|
28 |
+
hiddenimports=hiddenimports,
|
29 |
hookspath=['build_deps'],
|
30 |
hooksconfig={},
|
31 |
runtime_hooks=[],
|
|
|
43 |
a.scripts,
|
44 |
[],
|
45 |
exclude_binaries=True,
|
46 |
+
name='DocRedactApp_0.2.0',
|
47 |
debug=False,
|
48 |
bootloader_ignore_signals=False,
|
49 |
strip=False,
|
|
|
62 |
strip=False,
|
63 |
upx=True,
|
64 |
upx_exclude=[],
|
65 |
+
name='DocRedactApp_0.2.0',
|
66 |
)
|
Dockerfile
CHANGED
@@ -8,7 +8,8 @@ RUN apt-get update \
|
|
8 |
make \
|
9 |
cmake \
|
10 |
unzip \
|
11 |
-
libcurl4-openssl-dev \
|
|
|
12 |
&& apt-get clean \
|
13 |
&& rm -rf /var/lib/apt/lists/*
|
14 |
|
|
|
8 |
make \
|
9 |
cmake \
|
10 |
unzip \
|
11 |
+
libcurl4-openssl-dev \
|
12 |
+
git \
|
13 |
&& apt-get clean \
|
14 |
&& rm -rf /var/lib/apt/lists/*
|
15 |
|
app.py
CHANGED
@@ -453,7 +453,7 @@ with app:
|
|
453 |
# TABULAR DATA REDACTION
|
454 |
###
|
455 |
in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
|
456 |
-
then(fn=get_input_file_names, inputs=[in_data_files], outputs=[
|
457 |
|
458 |
tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
|
459 |
|
|
|
453 |
# TABULAR DATA REDACTION
|
454 |
###
|
455 |
in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
|
456 |
+
then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
|
457 |
|
458 |
tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
|
459 |
|
how_to_create_exe_dist.txt
CHANGED
@@ -12,9 +12,9 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
|
|
12 |
|
13 |
8. In command line, cd to the folder that contains app.py.
|
14 |
|
15 |
-
9.Run the following
|
16 |
|
17 |
-
a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio
|
18 |
|
19 |
# Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
|
20 |
|
@@ -28,11 +28,29 @@ a = Analysis(
|
|
28 |
}
|
29 |
)
|
30 |
|
31 |
-
|
32 |
|
|
|
33 |
|
34 |
-
9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
|
35 |
|
36 |
-
|
37 |
|
38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
8. In command line, cd to the folder that contains app.py.
|
14 |
|
15 |
+
9. Run the following (this helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
|
16 |
|
17 |
+
a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp_0.2.0 app.py
|
18 |
|
19 |
# Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
|
20 |
|
|
|
28 |
}
|
29 |
)
|
30 |
|
31 |
+
hook-presidio-image-redactor.py
|
32 |
|
33 |
+
c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.2.0.spec
|
34 |
|
|
|
35 |
|
36 |
+
9. A 'dist' folder will be created with the executable inside along with all dependencies ('dist\redaction').
|
37 |
|
38 |
+
10. Go to dist/APP-NAME/gradio/component_meta.py and modify the start of the 'create_or_modify_pyi(...' function to this:
|
39 |
+
|
40 |
+
def create_or_modify_pyi(
|
41 |
+
component_class: type, class_name: str, events: list[str | EventListener]
|
42 |
+
):
|
43 |
+
source_file = Path(inspect.getfile(component_class))
|
44 |
+
|
45 |
+
try:
|
46 |
+
# Try to read the source file
|
47 |
+
source_code = source_file.read_text(encoding="utf-8")
|
48 |
+
except FileNotFoundError:
|
49 |
+
# If source file not found, skip pyi generation
|
50 |
+
return None
|
51 |
+
|
52 |
+
11. Copy the poppler and tesseract folders into the location where the .exe is
|
53 |
+
|
54 |
+
12. In 'dist\redaction' try double clicking on the .exe file. After a short delay, the command prompt should inform you about the IP address of the app that is now running. Copy the IP address. **Do not close this window!**
|
55 |
+
|
56 |
+
13. In an Internet browser, navigate to the indicated IP address. The app should now be running in your browser window.
|
requirements.txt
CHANGED
@@ -12,14 +12,17 @@ scikit-learn==1.5.2
|
|
12 |
spacy==3.8.3
|
13 |
#en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
|
14 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
15 |
-
gradio==5.
|
16 |
-
boto3==1.
|
17 |
pyarrow==18.1.0
|
18 |
openpyxl==3.1.2
|
19 |
Faker==22.2.0
|
20 |
python-levenshtein==0.26.1
|
21 |
spaczz==0.6.1
|
22 |
gradio_image_annotation==0.2.5
|
|
|
|
|
|
|
23 |
numpy==1.26.4
|
24 |
awslambdaric==3.0.0
|
25 |
|
|
|
12 |
spacy==3.8.3
|
13 |
#en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
|
14 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
15 |
+
gradio==5.16.0
|
16 |
+
boto3==1.36.15
|
17 |
pyarrow==18.1.0
|
18 |
openpyxl==3.1.2
|
19 |
Faker==22.2.0
|
20 |
python-levenshtein==0.26.1
|
21 |
spaczz==0.6.1
|
22 |
gradio_image_annotation==0.2.5
|
23 |
+
# The following version includes rotation and image zoom options - not currently working so reverting to original until fixed
|
24 |
+
#git+https://github.com/seanpedrick-case/gradio_image_annotator
|
25 |
+
rapidfuzz==3.12.1
|
26 |
numpy==1.26.4
|
27 |
awslambdaric==3.0.0
|
28 |
|
tools/data_anonymise.py
CHANGED
@@ -389,6 +389,11 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
|
|
389 |
if isinstance(out_message, str):
|
390 |
out_message = [out_message]
|
391 |
|
|
|
|
|
|
|
|
|
|
|
392 |
if not out_file_paths:
|
393 |
out_file_paths = []
|
394 |
|
@@ -473,6 +478,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
|
|
473 |
sheet_name = ""
|
474 |
anon_df = read_file(anon_file)
|
475 |
out_file_part = get_file_name_without_type(anon_file.name)
|
|
|
476 |
out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
|
477 |
|
478 |
# Increase latest file completed count unless we are at the last file
|
|
|
389 |
if isinstance(out_message, str):
|
390 |
out_message = [out_message]
|
391 |
|
392 |
+
print("log_files_output_paths:",log_files_output_paths)
|
393 |
+
|
394 |
+
if isinstance(log_files_output_paths, str):
|
395 |
+
log_files_output_paths = []
|
396 |
+
|
397 |
if not out_file_paths:
|
398 |
out_file_paths = []
|
399 |
|
|
|
478 |
sheet_name = ""
|
479 |
anon_df = read_file(anon_file)
|
480 |
out_file_part = get_file_name_without_type(anon_file.name)
|
481 |
+
|
482 |
out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
|
483 |
|
484 |
# Increase latest file completed count unless we are at the last file
|
tools/file_conversion.py
CHANGED
@@ -304,44 +304,138 @@ def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:d
|
|
304 |
#shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
|
305 |
shape.commit()
|
306 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
307 |
def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
|
308 |
'''
|
309 |
Converts coordinates from pymupdf format to image coordinates,
|
310 |
-
accounting for mediabox dimensions.
|
311 |
'''
|
|
|
|
|
|
|
|
|
312 |
|
313 |
-
|
314 |
-
rect_width = pymupdf_page.rect.width
|
315 |
-
|
316 |
-
# Get mediabox dimensions
|
317 |
mediabox = pymupdf_page.mediabox
|
318 |
mediabox_width = mediabox.width
|
319 |
mediabox_height = mediabox.height
|
320 |
|
|
|
321 |
image_page_width, image_page_height = image.size
|
322 |
|
323 |
-
# Calculate scaling factors
|
324 |
-
|
325 |
-
|
|
|
|
|
|
|
326 |
|
327 |
-
#
|
328 |
-
#
|
|
|
329 |
|
330 |
-
|
331 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
332 |
|
333 |
-
|
334 |
-
|
335 |
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
341 |
|
342 |
return x1_image, y1_image, x2_image, y2_image
|
343 |
|
344 |
|
|
|
345 |
def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5):
|
346 |
# Small border to page that remains white
|
347 |
border = 5
|
@@ -598,13 +692,16 @@ def prepare_image_or_pdf(
|
|
598 |
all_annotations_object.append(annotation)
|
599 |
|
600 |
#print("annotation:", annotation, "for page:", str(i))
|
601 |
-
|
602 |
-
|
603 |
-
|
604 |
-
|
605 |
-
|
606 |
-
|
607 |
-
|
|
|
|
|
|
|
608 |
#print("Annotation page number:", annotation_page_number)
|
609 |
|
610 |
# Check if the annotation page number exists in the image file paths pages
|
@@ -744,7 +841,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
|
|
744 |
#print(number) # Output: 0
|
745 |
reported_number = int(number) + 1
|
746 |
else:
|
747 |
-
print("No number found before .png")
|
748 |
reported_number = 1
|
749 |
|
750 |
# Check if 'boxes' is in the annotation, if not, add an empty list
|
|
|
304 |
#shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
|
305 |
shape.commit()
|
306 |
|
307 |
+
# def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
|
308 |
+
# '''
|
309 |
+
# Converts coordinates from pymupdf format to image coordinates,
|
310 |
+
# accounting for mediabox dimensions and offset.
|
311 |
+
# '''
|
312 |
+
# # Get rect dimensions
|
313 |
+
# rect = pymupdf_page.rect
|
314 |
+
# rect_width = rect.width
|
315 |
+
# rect_height = rect.height
|
316 |
+
|
317 |
+
# # Get mediabox dimensions and position
|
318 |
+
# mediabox = pymupdf_page.mediabox
|
319 |
+
# mediabox_width = mediabox.width
|
320 |
+
# mediabox_height = mediabox.height
|
321 |
+
|
322 |
+
# # Get target image dimensions
|
323 |
+
# image_page_width, image_page_height = image.size
|
324 |
+
|
325 |
+
# # Calculate scaling factors
|
326 |
+
# image_to_mediabox_x_scale = image_page_width / mediabox_width
|
327 |
+
# image_to_mediabox_y_scale = image_page_height / mediabox_height
|
328 |
+
|
329 |
+
# image_to_rect_scale_width = image_page_width / rect_width
|
330 |
+
# image_to_rect_scale_height = image_page_height / rect_height
|
331 |
+
|
332 |
+
# # Adjust for offsets (difference in position between mediabox and rect)
|
333 |
+
# x_offset = rect.x0 - mediabox.x0 # Difference in x position
|
334 |
+
# y_offset = rect.y0 - mediabox.y0 # Difference in y position
|
335 |
+
|
336 |
+
# print("x_offset:", x_offset)
|
337 |
+
# print("y_offset:", y_offset)
|
338 |
+
|
339 |
+
# # Adjust coordinates:
|
340 |
+
# # Apply scaling to match image dimensions
|
341 |
+
# x1_image = x1 * image_to_mediabox_x_scale
|
342 |
+
# x2_image = x2 * image_to_mediabox_x_scale
|
343 |
+
# y1_image = y1 * image_to_mediabox_y_scale
|
344 |
+
# y2_image = y2 * image_to_mediabox_y_scale
|
345 |
+
|
346 |
+
# # Correct for difference in rect and mediabox size
|
347 |
+
# if mediabox_width != rect_width:
|
348 |
+
|
349 |
+
# mediabox_to_rect_x_scale = mediabox_width / rect_width
|
350 |
+
# mediabox_to_rect_y_scale = mediabox_height / rect_height
|
351 |
+
|
352 |
+
# x1_image *= mediabox_to_rect_x_scale
|
353 |
+
# x2_image *= mediabox_to_rect_x_scale
|
354 |
+
# y1_image *= mediabox_to_rect_y_scale
|
355 |
+
# y2_image *= mediabox_to_rect_y_scale
|
356 |
+
|
357 |
+
# print("mediabox_to_rect_x_scale:", mediabox_to_rect_x_scale)
|
358 |
+
# #print("mediabox_to_rect_y_scale:", mediabox_to_rect_y_scale)
|
359 |
+
|
360 |
+
# print("image_to_mediabox_x_scale:", image_to_mediabox_x_scale)
|
361 |
+
# #print("image_to_mediabox_y_scale:", image_to_mediabox_y_scale)
|
362 |
+
|
363 |
+
# mediabox_rect_x_diff = (mediabox_width - rect_width) * 2
|
364 |
+
# mediabox_rect_y_diff = (mediabox_height - rect_height) * 2
|
365 |
+
|
366 |
+
# x1_image -= mediabox_rect_x_diff
|
367 |
+
# x2_image -= mediabox_rect_x_diff
|
368 |
+
# y1_image += mediabox_rect_y_diff
|
369 |
+
# y2_image += mediabox_rect_y_diff
|
370 |
+
|
371 |
+
# return x1_image, y1_image, x2_image, y2_image
|
372 |
+
|
373 |
def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
|
374 |
'''
|
375 |
Converts coordinates from pymupdf format to image coordinates,
|
376 |
+
accounting for mediabox dimensions and offset.
|
377 |
'''
|
378 |
+
# Get rect dimensions
|
379 |
+
rect = pymupdf_page.rect
|
380 |
+
rect_width = rect.width
|
381 |
+
rect_height = rect.height
|
382 |
|
383 |
+
# Get mediabox dimensions and position
|
|
|
|
|
|
|
384 |
mediabox = pymupdf_page.mediabox
|
385 |
mediabox_width = mediabox.width
|
386 |
mediabox_height = mediabox.height
|
387 |
|
388 |
+
# Get target image dimensions
|
389 |
image_page_width, image_page_height = image.size
|
390 |
|
391 |
+
# Calculate scaling factors
|
392 |
+
image_to_mediabox_x_scale = image_page_width / mediabox_width
|
393 |
+
image_to_mediabox_y_scale = image_page_height / mediabox_height
|
394 |
+
|
395 |
+
image_to_rect_scale_width = image_page_width / rect_width
|
396 |
+
image_to_rect_scale_height = image_page_height / rect_height
|
397 |
|
398 |
+
# Adjust for offsets (difference in position between mediabox and rect)
|
399 |
+
x_offset = rect.x0 - mediabox.x0 # Difference in x position
|
400 |
+
y_offset = rect.y0 - mediabox.y0 # Difference in y position
|
401 |
|
402 |
+
#print("x_offset:", x_offset)
|
403 |
+
#print("y_offset:", y_offset)
|
404 |
+
|
405 |
+
# Adjust coordinates:
|
406 |
+
# Apply scaling to match image dimensions
|
407 |
+
x1_image = x1 * image_to_mediabox_x_scale
|
408 |
+
x2_image = x2 * image_to_mediabox_x_scale
|
409 |
+
y1_image = y1 * image_to_mediabox_y_scale
|
410 |
+
y2_image = y2 * image_to_mediabox_y_scale
|
411 |
+
|
412 |
+
# Correct for difference in rect and mediabox size
|
413 |
+
if mediabox_width != rect_width:
|
414 |
+
|
415 |
+
mediabox_to_rect_x_scale = mediabox_width / rect_width
|
416 |
+
mediabox_to_rect_y_scale = mediabox_height / rect_height
|
417 |
|
418 |
+
rect_to_mediabox_x_scale = rect_width / mediabox_width
|
419 |
+
#rect_to_mediabox_y_scale = rect_height / mediabox_height
|
420 |
|
421 |
+
mediabox_rect_x_diff = (mediabox_width - rect_width) * (image_to_mediabox_x_scale / 2)
|
422 |
+
mediabox_rect_y_diff = (mediabox_height - rect_height) * (image_to_mediabox_y_scale / 2)
|
423 |
+
|
424 |
+
x1_image -= mediabox_rect_x_diff
|
425 |
+
x2_image -= mediabox_rect_x_diff
|
426 |
+
y1_image += mediabox_rect_y_diff
|
427 |
+
y2_image += mediabox_rect_y_diff
|
428 |
+
|
429 |
+
#
|
430 |
+
x1_image *= mediabox_to_rect_x_scale
|
431 |
+
x2_image *= mediabox_to_rect_x_scale
|
432 |
+
y1_image *= mediabox_to_rect_y_scale
|
433 |
+
y2_image *= mediabox_to_rect_y_scale
|
434 |
|
435 |
return x1_image, y1_image, x2_image, y2_image
|
436 |
|
437 |
|
438 |
+
|
439 |
def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5):
|
440 |
# Small border to page that remains white
|
441 |
border = 5
|
|
|
692 |
all_annotations_object.append(annotation)
|
693 |
|
694 |
#print("annotation:", annotation, "for page:", str(i))
|
695 |
+
try:
|
696 |
+
if not annotation:
|
697 |
+
annotation = {"image":"", "boxes": []}
|
698 |
+
annotation_page_number = int(re.search(r'_(\d+)\.png$', image_file_path).group(1))
|
699 |
+
|
700 |
+
else:
|
701 |
+
annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
|
702 |
+
except Exception as e:
|
703 |
+
print("Extracting page number from image failed due to:", e)
|
704 |
+
annotation_page_number = 0
|
705 |
#print("Annotation page number:", annotation_page_number)
|
706 |
|
707 |
# Check if the annotation page number exists in the image file paths pages
|
|
|
841 |
#print(number) # Output: 0
|
842 |
reported_number = int(number) + 1
|
843 |
else:
|
844 |
+
print("No number found before .png. Returning page 1.")
|
845 |
reported_number = 1
|
846 |
|
847 |
# Check if 'boxes' is in the annotation, if not, add an empty list
|
tools/file_redaction.py
CHANGED
@@ -144,14 +144,21 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
144 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
145 |
|
146 |
if isinstance(custom_recogniser_word_list, pd.DataFrame):
|
147 |
-
|
|
|
|
|
|
|
|
|
148 |
|
149 |
# Sort the strings in order from the longest string to the shortest
|
150 |
custom_recogniser_word_list = sorted(custom_recogniser_word_list, key=len, reverse=True)
|
151 |
|
152 |
if isinstance(redact_whole_page_list, pd.DataFrame):
|
153 |
-
|
154 |
-
|
|
|
|
|
|
|
155 |
|
156 |
# If this is the first time around, set variables to 0/blank
|
157 |
if first_loop_state==True:
|
@@ -1209,7 +1216,7 @@ def redact_image_pdf(file_path:str,
|
|
1209 |
|
1210 |
## Apply annotations with pymupdf
|
1211 |
else:
|
1212 |
-
print("merged_redaction_boxes:", merged_redaction_bboxes)
|
1213 |
#print("redact_whole_page_list:", redact_whole_page_list)
|
1214 |
if redact_whole_page_list:
|
1215 |
int_reported_page_number = int(reported_page_number)
|
|
|
144 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
145 |
|
146 |
if isinstance(custom_recogniser_word_list, pd.DataFrame):
|
147 |
+
if not custom_recogniser_word_list.empty:
|
148 |
+
custom_recogniser_word_list = custom_recogniser_word_list.iloc[:, 0].tolist()
|
149 |
+
else:
|
150 |
+
# Handle the case where the DataFrame is empty
|
151 |
+
custom_recogniser_word_list = [] # or some default value
|
152 |
|
153 |
# Sort the strings in order from the longest string to the shortest
|
154 |
custom_recogniser_word_list = sorted(custom_recogniser_word_list, key=len, reverse=True)
|
155 |
|
156 |
if isinstance(redact_whole_page_list, pd.DataFrame):
|
157 |
+
if not redact_whole_page_list.empty:
|
158 |
+
redact_whole_page_list = redact_whole_page_list.iloc[:,0].tolist()
|
159 |
+
else:
|
160 |
+
# Handle the case where the DataFrame is empty
|
161 |
+
redact_whole_page_list = [] # or some default value
|
162 |
|
163 |
# If this is the first time around, set variables to 0/blank
|
164 |
if first_loop_state==True:
|
|
|
1216 |
|
1217 |
## Apply annotations with pymupdf
|
1218 |
else:
|
1219 |
+
#print("merged_redaction_boxes:", merged_redaction_bboxes)
|
1220 |
#print("redact_whole_page_list:", redact_whole_page_list)
|
1221 |
if redact_whole_page_list:
|
1222 |
int_reported_page_number = int(reported_page_number)
|
tools/load_spacy_model_custom_recognisers.py
CHANGED
@@ -184,9 +184,9 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
|
|
184 |
#print("custom_query_list:", custom_query_list)
|
185 |
|
186 |
if not text:
|
187 |
-
out_message = "
|
188 |
print(out_message)
|
189 |
-
return
|
190 |
|
191 |
for string_query in custom_query_list:
|
192 |
|
@@ -254,14 +254,14 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
|
|
254 |
for match_id, start, end, ratio, pattern in matches:
|
255 |
span = str(doc[start:end]).strip()
|
256 |
query_search = str(query).strip()
|
257 |
-
print("doc:", doc)
|
258 |
-
print("span:", span)
|
259 |
-
print("query_search:", query_search)
|
260 |
|
261 |
# Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
|
262 |
distance = Levenshtein.distance(query_search.lower(), span.lower())
|
263 |
|
264 |
-
print("Levenshtein distance:", distance)
|
265 |
|
266 |
if distance > spelling_mistakes_max:
|
267 |
match_count = match_count - 1
|
@@ -270,8 +270,8 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
|
|
270 |
start_char = doc[start].idx # Start character position
|
271 |
end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
|
272 |
|
273 |
-
print("start_char:", start_char)
|
274 |
-
print("end_char:", end_char)
|
275 |
|
276 |
all_matches.append(match_count)
|
277 |
all_start_positions.append(start_char)
|
|
|
184 |
#print("custom_query_list:", custom_query_list)
|
185 |
|
186 |
if not text:
|
187 |
+
out_message = "No text data found. Skipping page."
|
188 |
print(out_message)
|
189 |
+
return all_start_positions, all_end_positions
|
190 |
|
191 |
for string_query in custom_query_list:
|
192 |
|
|
|
254 |
for match_id, start, end, ratio, pattern in matches:
|
255 |
span = str(doc[start:end]).strip()
|
256 |
query_search = str(query).strip()
|
257 |
+
#print("doc:", doc)
|
258 |
+
#print("span:", span)
|
259 |
+
#print("query_search:", query_search)
|
260 |
|
261 |
# Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
|
262 |
distance = Levenshtein.distance(query_search.lower(), span.lower())
|
263 |
|
264 |
+
#print("Levenshtein distance:", distance)
|
265 |
|
266 |
if distance > spelling_mistakes_max:
|
267 |
match_count = match_count - 1
|
|
|
270 |
start_char = doc[start].idx # Start character position
|
271 |
end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
|
272 |
|
273 |
+
#print("start_char:", start_char)
|
274 |
+
#print("end_char:", end_char)
|
275 |
|
276 |
all_matches.append(match_count)
|
277 |
all_start_positions.append(start_char)
|
tools/redaction_review.py
CHANGED
@@ -137,7 +137,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
|
|
137 |
page_num_reported = 1
|
138 |
|
139 |
out_image_annotator = image_annotator(
|
140 |
-
|
141 |
boxes_alpha=0.1,
|
142 |
box_thickness=1,
|
143 |
label_list=recogniser_entities_list,
|
@@ -295,9 +295,14 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
|
|
295 |
fill = img_annotation_box["color"]
|
296 |
|
297 |
draw.rectangle(coords, fill=fill)
|
298 |
-
|
|
|
299 |
image.save(output_folder + file_name_without_ext + "_redacted.png")
|
300 |
|
|
|
|
|
|
|
|
|
301 |
doc = [image]
|
302 |
|
303 |
elif file_extension in '.csv':
|
@@ -347,7 +352,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
|
|
347 |
output_files.append(out_pdf_file_path)
|
348 |
|
349 |
else:
|
350 |
-
print("PDF input not found.")
|
351 |
|
352 |
# If save_pdf is not true, then add the original pdf to the output files
|
353 |
else:
|
@@ -500,8 +505,8 @@ def create_xfdf(df, pdf_path, pymupdf_doc, image_paths):
|
|
500 |
redact_annot.set('interior-color', colour_str)
|
501 |
#redact_annot.set('fill-color', colour_str)
|
502 |
#redact_annot.set('outline-color', colour_str)
|
503 |
-
redact_annot.set('overlay-color', colour_str)
|
504 |
-
redact_annot.set('overlay-text', row['label'])
|
505 |
redact_annot.set('opacity', "0.5")
|
506 |
|
507 |
# Add appearance dictionary
|
|
|
137 |
page_num_reported = 1
|
138 |
|
139 |
out_image_annotator = image_annotator(
|
140 |
+
None,
|
141 |
boxes_alpha=0.1,
|
142 |
box_thickness=1,
|
143 |
label_list=recogniser_entities_list,
|
|
|
295 |
fill = img_annotation_box["color"]
|
296 |
|
297 |
draw.rectangle(coords, fill=fill)
|
298 |
+
|
299 |
+
output_image_path = output_folder + file_name_without_ext + "_redacted.png"
|
300 |
image.save(output_folder + file_name_without_ext + "_redacted.png")
|
301 |
|
302 |
+
output_files.append(output_image_path)
|
303 |
+
|
304 |
+
print("Redactions saved to image file")
|
305 |
+
|
306 |
doc = [image]
|
307 |
|
308 |
elif file_extension in '.csv':
|
|
|
352 |
output_files.append(out_pdf_file_path)
|
353 |
|
354 |
else:
|
355 |
+
print("PDF input not found. Outputs not saved to PDF.")
|
356 |
|
357 |
# If save_pdf is not true, then add the original pdf to the output files
|
358 |
else:
|
|
|
505 |
redact_annot.set('interior-color', colour_str)
|
506 |
#redact_annot.set('fill-color', colour_str)
|
507 |
#redact_annot.set('outline-color', colour_str)
|
508 |
+
#redact_annot.set('overlay-color', colour_str)
|
509 |
+
#redact_annot.set('overlay-text', row['label'])
|
510 |
redact_annot.set('opacity', "0.5")
|
511 |
|
512 |
# Add appearance dictionary
|