Commit 3cecbfa
Parent(s): 760ef5c

Fixed issues with gradio version 5.16. Fixed fuzzy search error with pages with no data.
Files changed:
- requirements.txt +1 -2
- tools/file_redaction.py +11 -4
- tools/load_spacy_model_custom_recognisers.py +8 -8
- tools/redaction_review.py +1 -1
requirements.txt
CHANGED

@@ -12,7 +12,7 @@ scikit-learn==1.5.2
 spacy==3.8.3
 #en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-3.8.0.tar.gz
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
-gradio==5.
+gradio==5.16.0
 boto3==1.36.15
 pyarrow==18.1.0
 openpyxl==3.1.2
@@ -22,7 +22,6 @@ spaczz==0.6.1
 #gradio_image_annotation==0.2.5
 # The following version includes rotation and image zoom options
 git+https://github.com/seanpedrick-case/gradio_image_annotator
-
 rapidfuzz==3.12.1
 numpy==1.26.4
 awslambdaric==3.0.0
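The only functional change here is pinning gradio to 5.16.0 (the previous pin is truncated in this view). A minimal sanity check after reinstalling the requirements, as a sketch:

    import gradio as gr

    # The installed version should match the new pin in requirements.txt.
    print(gr.__version__)  # expected: 5.16.0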
tools/file_redaction.py
CHANGED

@@ -144,14 +144,21 @@ def choose_and_run_redactor(file_paths:List[str],
     review_out_file_paths = [prepared_pdf_file_paths[0]]
 
     if isinstance(custom_recogniser_word_list, pd.DataFrame):
-        custom_recogniser_word_list = custom_recogniser_word_list.iloc[:, 0].tolist()
+        if not custom_recogniser_word_list.empty:
+            custom_recogniser_word_list = custom_recogniser_word_list.iloc[:, 0].tolist()
+        else:
+            # Handle the case where the DataFrame is empty
+            custom_recogniser_word_list = []  # or some default value
 
         # Sort the strings in order from the longest string to the shortest
         custom_recogniser_word_list = sorted(custom_recogniser_word_list, key=len, reverse=True)
 
     if isinstance(redact_whole_page_list, pd.DataFrame):
-        redact_whole_page_list = redact_whole_page_list.iloc[:,0].tolist()
-
+        if not redact_whole_page_list.empty:
+            redact_whole_page_list = redact_whole_page_list.iloc[:,0].tolist()
+        else:
+            # Handle the case where the DataFrame is empty
+            redact_whole_page_list = []  # or some default value
 
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
@@ -1209,7 +1216,7 @@ def redact_image_pdf(file_path:str,
 
         ## Apply annotations with pymupdf
         else:
-            print("merged_redaction_boxes:", merged_redaction_bboxes)
+            #print("merged_redaction_boxes:", merged_redaction_bboxes)
             #print("redact_whole_page_list:", redact_whole_page_list)
             if redact_whole_page_list:
                 int_reported_page_number = int(reported_page_number)
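Both hunks in choose_and_run_redactor add the same guard: check that a DataFrame is non-empty before converting its first column to a list, falling back to an empty list otherwise. A minimal standalone sketch of that guard (dataframe_to_word_list is a hypothetical helper name; the repo applies the logic inline):

    import pandas as pd

    def dataframe_to_word_list(df: pd.DataFrame) -> list:
        # .iloc[:, 0] on a DataFrame with no columns raises IndexError,
        # so check .empty first, as the commit does.
        if not df.empty:
            return df.iloc[:, 0].tolist()
        return []  # or some default value

    print(dataframe_to_word_list(pd.DataFrame({"words": ["Smith-Baker", "Jones"]})))
    # ['Smith-Baker', 'Jones']
    print(dataframe_to_word_list(pd.DataFrame()))
    # []

The list is then sorted longest string first, presumably so longer custom terms are matched before any of their substrings.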
tools/load_spacy_model_custom_recognisers.py
CHANGED

@@ -184,9 +184,9 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
     #print("custom_query_list:", custom_query_list)
 
     if not text:
-        out_message = "
+        out_message = "No text data found. Skipping page."
         print(out_message)
-        return
+        return all_start_positions, all_end_positions
 
     for string_query in custom_query_list:
 
@@ -254,14 +254,14 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
         for match_id, start, end, ratio, pattern in matches:
             span = str(doc[start:end]).strip()
             query_search = str(query).strip()
-            print("doc:", doc)
-            print("span:", span)
-            print("query_search:", query_search)
+            #print("doc:", doc)
+            #print("span:", span)
+            #print("query_search:", query_search)
 
             # Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
             distance = Levenshtein.distance(query_search.lower(), span.lower())
 
-            print("Levenshtein distance:", distance)
+            #print("Levenshtein distance:", distance)
 
             if distance > spelling_mistakes_max:
                 match_count = match_count - 1
@@ -270,8 +270,8 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
             start_char = doc[start].idx # Start character position
             end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
 
-            print("start_char:", start_char)
-            print("end_char:", end_char)
+            #print("start_char:", start_char)
+            #print("end_char:", end_char)
 
             all_matches.append(match_count)
             all_start_positions.append(start_char)
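The fuzzy-search fix has two parts: a real message for the empty-page case, and returning the two (empty) position lists instead of a bare return, so the caller's tuple unpacking no longer fails on pages with no data. A toy sketch of the same shape, using rapidfuzz's Levenshtein distance (rapidfuzz is pinned in requirements.txt; the word-by-word scan stands in for the spaczz matcher the real function uses):

    from rapidfuzz.distance import Levenshtein

    def fuzzy_match_positions(text: str, query: str, spelling_mistakes_max: int = 1):
        all_start_positions, all_end_positions = [], []

        if not text:
            print("No text data found. Skipping page.")
            # Same tuple shape as the normal path, so callers can always unpack.
            return all_start_positions, all_end_positions

        position = 0
        for word in text.split():
            start_char = text.index(word, position)  # character offset of this word
            end_char = start_char + len(word)
            position = end_char
            # Keep only matches within the allowed number of spelling mistakes.
            if Levenshtein.distance(query.lower(), word.lower()) <= spelling_mistakes_max:
                all_start_positions.append(start_char)
                all_end_positions.append(end_char)

        return all_start_positions, all_end_positions

    print(fuzzy_match_positions("Contact Mr Jons about the case", "Jones"))
    # ([11], [15])
    print(fuzzy_match_positions("", "Jones"))
    # ([], []) -- an empty page no longer breaks the caller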
tools/redaction_review.py
CHANGED

@@ -137,7 +137,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
     page_num_reported = 1
 
     out_image_annotator = image_annotator(
-
+        None,
         boxes_alpha=0.1,
         box_thickness=1,
         label_list=recogniser_entities_list,
|