seanpedrickcase committed
Commit 3cecbfa · 1 Parent(s): 760ef5c

Fixed compatibility issues with gradio 5.16. Fixed a fuzzy search error on pages with no data.

requirements.txt CHANGED
@@ -12,7 +12,7 @@ scikit-learn==1.5.2
 spacy==3.8.3
 #en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
-gradio==5.15.0
+gradio==5.16.0
 boto3==1.36.15
 pyarrow==18.1.0
 openpyxl==3.1.2
@@ -22,7 +22,6 @@ spaczz==0.6.1
 #gradio_image_annotation==0.2.5
 # The following version includes rotation and image zoom options
 git+https://github.com/seanpedrick-case/gradio_image_annotator
-
 rapidfuzz==3.12.1
 numpy==1.26.4
 awslambdaric==3.0.0
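
The only functional change in this file is the gradio pin moving from 5.15.0 to 5.16.0 (plus a dropped blank line). A quick post-install sanity check, as a minimal sketch that is not part of the commit:

# Minimal sketch: confirm the environment picked up the new pin after
# reinstalling from requirements.txt. Not part of the commit itself.
import gradio as gr

print(gr.__version__)  # expected to start with "5.16" given the pin above
assert gr.__version__.startswith("5.16"), "gradio was not upgraded to the pinned 5.16.x release"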
tools/file_redaction.py CHANGED
@@ -144,14 +144,21 @@ def choose_and_run_redactor(file_paths:List[str],
     review_out_file_paths = [prepared_pdf_file_paths[0]]
 
     if isinstance(custom_recogniser_word_list, pd.DataFrame):
-        custom_recogniser_word_list = custom_recogniser_word_list.iloc[:,0].tolist()
+        if not custom_recogniser_word_list.empty:
+            custom_recogniser_word_list = custom_recogniser_word_list.iloc[:, 0].tolist()
+        else:
+            # Handle the case where the DataFrame is empty
+            custom_recogniser_word_list = []  # or some default value
 
         # Sort the strings in order from the longest string to the shortest
         custom_recogniser_word_list = sorted(custom_recogniser_word_list, key=len, reverse=True)
 
     if isinstance(redact_whole_page_list, pd.DataFrame):
-        redact_whole_page_list = redact_whole_page_list.iloc[:,0].tolist()
-
+        if not redact_whole_page_list.empty:
+            redact_whole_page_list = redact_whole_page_list.iloc[:,0].tolist()
+        else:
+            # Handle the case where the DataFrame is empty
+            redact_whole_page_list = []  # or some default value
 
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
@@ -1209,7 +1216,7 @@ def redact_image_pdf(file_path:str,
 
         ## Apply annotations with pymupdf
         else:
-            print("merged_redaction_boxes:", merged_redaction_bboxes)
+            #print("merged_redaction_boxes:", merged_redaction_bboxes)
             #print("redact_whole_page_list:", redact_whole_page_list)
             if redact_whole_page_list:
                 int_reported_page_number = int(reported_page_number)
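
The guard added above follows a common pandas pattern: convert the first column to a list only when the DataFrame actually has rows, otherwise fall back to an empty list so downstream code (such as the sorted(..., key=len) call) still receives a list. A standalone sketch of the same idea; the names below are illustrative, not the app's own:

# Standalone sketch of the guard introduced above.
from typing import List
import pandas as pd

def first_column_as_list(value) -> List[str]:
    if isinstance(value, pd.DataFrame):
        if not value.empty:
            return value.iloc[:, 0].astype(str).tolist()
        return []  # empty word list / page list: nothing to redact
    return list(value)  # already list-like

print(first_column_as_list(pd.DataFrame({"word": ["name", "address"]})))  # ['name', 'address']
print(first_column_as_list(pd.DataFrame()))                               # []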
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -184,9 +184,9 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
     #print("custom_query_list:", custom_query_list)
 
     if not text:
-        out_message = "Prepared data not found. Have you clicked 'Load data' above to prepare a search index?"
+        out_message = "No text data found. Skipping page."
         print(out_message)
-        return out_message, None
+        return all_start_positions, all_end_positions
 
     for string_query in custom_query_list:
 
@@ -254,14 +254,14 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
         for match_id, start, end, ratio, pattern in matches:
             span = str(doc[start:end]).strip()
             query_search = str(query).strip()
-            print("doc:", doc)
-            print("span:", span)
-            print("query_search:", query_search)
+            #print("doc:", doc)
+            #print("span:", span)
+            #print("query_search:", query_search)
 
             # Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
             distance = Levenshtein.distance(query_search.lower(), span.lower())
 
-            print("Levenshtein distance:", distance)
+            #print("Levenshtein distance:", distance)
 
             if distance > spelling_mistakes_max:
                 match_count = match_count - 1
@@ -270,8 +270,8 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
             start_char = doc[start].idx  # Start character position
             end_char = doc[end - 1].idx + len(doc[end - 1])  # End character position
 
-            print("start_char:", start_char)
-            print("end_char:", end_char)
+            #print("start_char:", start_char)
+            #print("end_char:", end_char)
 
             all_matches.append(match_count)
             all_start_positions.append(start_char)
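
The first hunk is the actual bug fix named in the commit message: for a page with no text, spacy_fuzzy_search now returns its (empty) position lists instead of a message/None pair, so callers always receive lists. A self-contained sketch of that behaviour, using rapidfuzz (already pinned in requirements.txt) for the edit distance; the function and variable names below are illustrative, not the app's own:

# Sketch: a fuzzy page search that always returns (start_positions, end_positions),
# even for an empty page, and that drops candidates whose edit distance exceeds
# the allowed number of spelling mistakes.
from typing import List, Tuple
from rapidfuzz.distance import Levenshtein

def fuzzy_find(text: str, query: str, spelling_mistakes_max: int = 1) -> Tuple[List[int], List[int]]:
    all_start_positions: List[int] = []
    all_end_positions: List[int] = []

    if not text:
        print("No text data found. Skipping page.")
        return all_start_positions, all_end_positions  # empty lists, not an error message

    # Naive whitespace tokenisation as a stand-in for the spaczz matcher.
    offsets, pos = [], 0
    for tok in text.split():
        pos = text.index(tok, pos)
        offsets.append((pos, pos + len(tok)))
        pos += len(tok)

    for start, end in offsets:
        span = text[start:end]
        if Levenshtein.distance(query.lower(), span.lower()) <= spelling_mistakes_max:
            all_start_positions.append(start)
            all_end_positions.append(end)

    return all_start_positions, all_end_positions

print(fuzzy_find("the qick brown fox", "quick"))  # ([4], [8]) -- 'qick' is within one edit
print(fuzzy_find("", "quick"))                    # ([], [])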
tools/redaction_review.py CHANGED
@@ -137,7 +137,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
         page_num_reported = 1
 
     out_image_annotator = image_annotator(
-        image_annotator_object[page_num_reported - 1],
+        None,
         boxes_alpha=0.1,
         box_thickness=1,
         label_list=recogniser_entities_list,
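
Here the custom image_annotator component is now constructed with no initial value (None) rather than indexing into the annotation data at build time, which appears to be part of the gradio 5.16 compatibility work. A hypothetical, simplified sketch of the same "build empty, populate from an event" pattern using a stock gradio component (not the app's actual code or the custom annotator):

import gradio as gr

def load_preview():
    # Return whatever the component should display once data is ready;
    # "example_page.png" is a placeholder path, not a file from this repo.
    return "example_page.png"

with gr.Blocks() as demo:
    preview = gr.Image(value=None, label="Page preview")  # empty at build time
    load_btn = gr.Button("Load page")
    load_btn.click(load_preview, outputs=preview)

# demo.launch()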