seanpedrickcase committed on
Commit
613b1b4
·
1 Parent(s): e8681e8

Uploaded PDFs with review files will now include all pages that don't have redactions. Slightly improved deny-list matching.

Browse files
app.py CHANGED
@@ -346,8 +346,6 @@ with app:
346
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
347
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
348
 
349
-
350
-
351
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
352
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
353
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
346
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
347
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
348
 
 
 
349
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
350
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
351
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
tools/file_conversion.py CHANGED
@@ -501,7 +501,7 @@ def prepare_image_or_pdf(
501
 
502
  elif file_extension in ['.csv']:
503
  review_file_csv = read_file(file)
504
- all_annotations_object = convert_pandas_df_to_review_json(review_file_csv)
505
  json_from_csv = True
506
  print("Converted CSV review file to json")
507
 
@@ -738,25 +738,38 @@ def convert_review_json_to_pandas_df(data:List[dict], text_join_data=pd.DataFram
738
 
739
  return df
740
 
741
- def convert_pandas_df_to_review_json(df: pd.DataFrame) -> List[dict]:
 
 
 
742
  # Keep only necessary columns
743
  df = df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
744
 
745
  # Group the DataFrame by the 'image' column
746
- grouped = df.groupby('image')
747
 
748
  # Create a list to hold the JSON data
749
  json_data = []
750
 
751
- # Iterate over each group
752
- for image_path, group in grouped:
753
- # Convert each group to a list of box dictionaries
754
- boxes = group.drop(columns=['image', 'page']).to_dict(orient='records')
755
-
 
 
 
 
 
 
 
 
 
 
 
 
 
756
  # Append the structured data to the json_data list
757
- json_data.append({
758
- "image": image_path,
759
- "boxes": boxes
760
- })
761
 
762
  return json_data
 
501
 
502
  elif file_extension in ['.csv']:
503
  review_file_csv = read_file(file)
504
+ all_annotations_object = convert_pandas_df_to_review_json(review_file_csv, image_file_paths)
505
  json_from_csv = True
506
  print("Converted CSV review file to json")
507
 
 
738
 
739
  return df
740
 
741
+ def convert_pandas_df_to_review_json(df: pd.DataFrame, image_paths: List[Image.Image]) -> List[dict]:
742
+ '''
743
+ Convert a review csv to a json file for use by the Gradio Annotation object
744
+ '''
745
  # Keep only necessary columns
746
  df = df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
747
 
748
  # Group the DataFrame by the 'image' column
749
+ grouped_csv_pages = df.groupby('page')
750
 
751
  # Create a list to hold the JSON data
752
  json_data = []
753
 
754
+ for n, pdf_image_path in enumerate(image_paths):
755
+ reported_page_number = int(n + 1)
756
+
757
+ if reported_page_number in df["page"].values:
758
+
759
+ # Convert each relevant group to a list of box dictionaries
760
+ selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)
761
+ annotation_boxes = selected_csv_pages.drop(columns=['image', 'page']).to_dict(orient='records')
762
+
763
+ annotation = {
764
+ "image": pdf_image_path,
765
+ "boxes": annotation_boxes
766
+ }
767
+
768
+ else:
769
+ annotation = {}
770
+ annotation["image"] = pdf_image_path
771
+
772
  # Append the structured data to the json_data list
773
+ json_data.append(annotation)
 
 
 
774
 
775
  return json_data
tools/file_redaction.py CHANGED
@@ -1307,8 +1307,14 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1307
  character_objects_out.append(char) # Collect character objects
1308
 
1309
  if isinstance(char, LTAnno):
 
 
 
 
 
 
1310
  # Handle space separately by finalizing the word
1311
- full_text += char.get_text() # Adds space or newline
1312
 
1313
  if current_word: # Only finalize if there is a current word
1314
  word_bboxes.append((current_word, current_word_bbox))
@@ -1316,7 +1322,7 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1316
  current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word
1317
 
1318
  # Check for line break (assuming a new line is indicated by a specific character)
1319
- if '\n' in char.get_text():
1320
  #print("char_anno:", char)
1321
  # Finalize the current line
1322
  if current_word:
@@ -1335,7 +1341,6 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1335
 
1336
  # Concatenate text for LTChar
1337
 
1338
-
1339
  #full_text += char.get_text()
1340
  #added_text = re.sub(r'[^\x00-\x7F]+', ' ', char.get_text())
1341
  added_text = char.get_text()
@@ -1344,8 +1349,6 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1344
  added_text = clean_unicode_text(added_text)
1345
  full_text += added_text # Adds space or newline, removing
1346
 
1347
-
1348
-
1349
  # Update overall bounding box
1350
  x0, y0, x1, y1 = char.bbox
1351
  overall_bbox[0] = min(overall_bbox[0], x0) # x0
 
1307
  character_objects_out.append(char) # Collect character objects
1308
 
1309
  if isinstance(char, LTAnno):
1310
+
1311
+ added_text = char.get_text()
1312
+
1313
+ # Handle double quotes
1314
+ added_text = added_text.replace('"', '\\"') # Escape double quotes
1315
+
1316
  # Handle space separately by finalizing the word
1317
+ full_text += added_text # Adds space or newline
1318
 
1319
  if current_word: # Only finalize if there is a current word
1320
  word_bboxes.append((current_word, current_word_bbox))
 
1322
  current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word
1323
 
1324
  # Check for line break (assuming a new line is indicated by a specific character)
1325
+ if '\n' in added_text:
1326
  #print("char_anno:", char)
1327
  # Finalize the current line
1328
  if current_word:
 
1341
 
1342
  # Concatenate text for LTChar
1343
 
 
1344
  #full_text += char.get_text()
1345
  #added_text = re.sub(r'[^\x00-\x7F]+', ' ', char.get_text())
1346
  added_text = char.get_text()
 
1349
  added_text = clean_unicode_text(added_text)
1350
  full_text += added_text # Adds space or newline, removing
1351
 
 
 
1352
  # Update overall bounding box
1353
  x0, y0, x1, y1 = char.bbox
1354
  overall_bbox[0] = min(overall_bbox[0], x0) # x0
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -26,7 +26,8 @@ except:
26
  # #### Custom recognisers
27
  # Allow user to create their own recogniser
28
  def custom_word_list_recogniser(custom_list:List[str]=[]):
29
- custom_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term)}" for term in custom_list) + '\\b'
 
30
  custom_pattern = Pattern(name="custom_pattern",regex=custom_regex, score = 1)
31
 
32
  #print("custom_pattern:", custom_pattern)
 
26
  # #### Custom recognisers
27
  # Allow user to create their own recogniser
28
  def custom_word_list_recogniser(custom_list:List[str]=[]):
29
+ #custom_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term.strip())}" for term in custom_list) + '\\b'
30
+ custom_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term.strip())}(?=\W|$)" for term in custom_list)
31
  custom_pattern = Pattern(name="custom_pattern",regex=custom_regex, score = 1)
32
 
33
  #print("custom_pattern:", custom_pattern)