Commit 3cecbfa
Parent(s): 760ef5c

Fixed issues with gradio version 5.16. Fixed fuzzy search error with pages with no data.
Files changed:
- requirements.txt +1 -2
- tools/file_redaction.py +11 -4
- tools/load_spacy_model_custom_recognisers.py +8 -8
- tools/redaction_review.py +1 -1
requirements.txt
CHANGED

@@ -12,7 +12,7 @@ scikit-learn==1.5.2
 spacy==3.8.3
 #en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-3.8.0.tar.gz
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
-gradio==5.
+gradio==5.16.0
 boto3==1.36.15
 pyarrow==18.1.0
 openpyxl==3.1.2
@@ -22,7 +22,6 @@ spaczz==0.6.1
 #gradio_image_annotation==0.2.5
 # The following version includes rotation and image zoom options
 git+https://github.com/seanpedrick-case/gradio_image_annotator
-
 rapidfuzz==3.12.1
 numpy==1.26.4
 awslambdaric==3.0.0
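The only functional change here is pinning gradio to 5.16.0 (the previous pin is truncated in this view). A minimal sanity check after reinstalling the requirements, as a sketch:

    import gradio as gr

    # The installed version should match the new pin in requirements.txt.
    print(gr.__version__)  # expected: 5.16.0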
tools/file_redaction.py
CHANGED

@@ -144,14 +144,21 @@ def choose_and_run_redactor(file_paths:List[str],
     review_out_file_paths = [prepared_pdf_file_paths[0]]
 
     if isinstance(custom_recogniser_word_list, pd.DataFrame):
-        custom_recogniser_word_list = custom_recogniser_word_list.iloc[:, 0].tolist()
+        if not custom_recogniser_word_list.empty:
+            custom_recogniser_word_list = custom_recogniser_word_list.iloc[:, 0].tolist()
+        else:
+            # Handle the case where the DataFrame is empty
+            custom_recogniser_word_list = []  # or some default value
 
         # Sort the strings in order from the longest string to the shortest
         custom_recogniser_word_list = sorted(custom_recogniser_word_list, key=len, reverse=True)
 
     if isinstance(redact_whole_page_list, pd.DataFrame):
-        redact_whole_page_list = redact_whole_page_list.iloc[:,0].tolist()
-
+        if not redact_whole_page_list.empty:
+            redact_whole_page_list = redact_whole_page_list.iloc[:,0].tolist()
+        else:
+            # Handle the case where the DataFrame is empty
+            redact_whole_page_list = []  # or some default value
 
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
@@ -1209,7 +1216,7 @@ def redact_image_pdf(file_path:str,
 
         ## Apply annotations with pymupdf
         else:
-            print("merged_redaction_boxes:", merged_redaction_bboxes)
+            #print("merged_redaction_boxes:", merged_redaction_bboxes)
             #print("redact_whole_page_list:", redact_whole_page_list)
             if redact_whole_page_list:
                 int_reported_page_number = int(reported_page_number)
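Both hunks in choose_and_run_redactor add the same guard: check that a DataFrame is non-empty before converting its first column to a list, falling back to an empty list otherwise. A minimal standalone sketch of that guard (dataframe_to_word_list is a hypothetical helper name; the repo applies the logic inline):

    import pandas as pd

    def dataframe_to_word_list(df: pd.DataFrame) -> list:
        # .iloc[:, 0] on a DataFrame with no columns raises IndexError,
        # so check .empty first, as the commit does.
        if not df.empty:
            return df.iloc[:, 0].tolist()
        return []  # or some default value

    print(dataframe_to_word_list(pd.DataFrame({"words": ["Smith-Baker", "Jones"]})))
    # ['Smith-Baker', 'Jones']
    print(dataframe_to_word_list(pd.DataFrame()))
    # []

The list is then sorted longest string first, presumably so longer custom terms are matched before any of their substrings.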
tools/load_spacy_model_custom_recognisers.py
CHANGED

@@ -184,9 +184,9 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
     #print("custom_query_list:", custom_query_list)
 
     if not text:
-        out_message = "
+        out_message = "No text data found. Skipping page."
         print(out_message)
-        return
+        return all_start_positions, all_end_positions
 
     for string_query in custom_query_list:
 
@@ -254,14 +254,14 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
         for match_id, start, end, ratio, pattern in matches:
             span = str(doc[start:end]).strip()
             query_search = str(query).strip()
-            print("doc:", doc)
-            print("span:", span)
-            print("query_search:", query_search)
+            #print("doc:", doc)
+            #print("span:", span)
+            #print("query_search:", query_search)
 
             # Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
             distance = Levenshtein.distance(query_search.lower(), span.lower())
 
-            print("Levenshtein distance:", distance)
+            #print("Levenshtein distance:", distance)
 
             if distance > spelling_mistakes_max:
                 match_count = match_count - 1
@@ -270,8 +270,8 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
             start_char = doc[start].idx # Start character position
             end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
 
-            print("start_char:", start_char)
-            print("end_char:", end_char)
+            #print("start_char:", start_char)
+            #print("end_char:", end_char)
 
             all_matches.append(match_count)
             all_start_positions.append(start_char)
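The fuzzy-search fix has two parts: a real message for the empty-page case, and returning the two (empty) position lists instead of a bare return, so the caller's tuple unpacking no longer fails on pages with no data. A toy sketch of the same shape, using rapidfuzz's Levenshtein distance (rapidfuzz is pinned in requirements.txt; the word-by-word scan stands in for the spaczz matcher the real function uses):

    from rapidfuzz.distance import Levenshtein

    def fuzzy_match_positions(text: str, query: str, spelling_mistakes_max: int = 1):
        all_start_positions, all_end_positions = [], []

        if not text:
            print("No text data found. Skipping page.")
            # Same tuple shape as the normal path, so callers can always unpack.
            return all_start_positions, all_end_positions

        position = 0
        for word in text.split():
            start_char = text.index(word, position)  # character offset of this word
            end_char = start_char + len(word)
            position = end_char
            # Keep only matches within the allowed number of spelling mistakes.
            if Levenshtein.distance(query.lower(), word.lower()) <= spelling_mistakes_max:
                all_start_positions.append(start_char)
                all_end_positions.append(end_char)

        return all_start_positions, all_end_positions

    print(fuzzy_match_positions("Contact Mr Jons about the case", "Jones"))
    # ([11], [15])
    print(fuzzy_match_positions("", "Jones"))
    # ([], []) -- an empty page no longer breaks the caller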
tools/redaction_review.py
CHANGED

@@ -137,7 +137,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
     page_num_reported = 1
 
     out_image_annotator = image_annotator(
-
+        None,
         boxes_alpha=0.1,
         box_thickness=1,
         label_list=recogniser_entities_list,
|