seanpedrickcase committed on
Commit
613b1b4
·
1 Parent(s): e8681e8

Uploaded PDFs with review files will now include all pages that don't have redactions. Slightly improved deny-list matching.

Browse files
app.py CHANGED
@@ -346,8 +346,6 @@ with app:
346
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
347
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
348
 
349
-
350
-
351
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
352
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
353
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
346
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
347
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
348
 
 
 
349
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
350
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
351
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
tools/file_conversion.py CHANGED
@@ -501,7 +501,7 @@ def prepare_image_or_pdf(
501
 
502
  elif file_extension in ['.csv']:
503
  review_file_csv = read_file(file)
504
- all_annotations_object = convert_pandas_df_to_review_json(review_file_csv)
505
  json_from_csv = True
506
  print("Converted CSV review file to json")
507
 
@@ -738,25 +738,38 @@ def convert_review_json_to_pandas_df(data:List[dict], text_join_data=pd.DataFram
738
 
739
  return df
740
 
741
- def convert_pandas_df_to_review_json(df: pd.DataFrame) -> List[dict]:
 
 
 
742
  # Keep only necessary columns
743
  df = df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
744
 
745
  # Group the DataFrame by the 'image' column
746
- grouped = df.groupby('image')
747
 
748
  # Create a list to hold the JSON data
749
  json_data = []
750
 
751
- # Iterate over each group
752
- for image_path, group in grouped:
753
- # Convert each group to a list of box dictionaries
754
- boxes = group.drop(columns=['image', 'page']).to_dict(orient='records')
755
-
 
 
 
 
 
 
 
 
 
 
 
 
 
756
  # Append the structured data to the json_data list
757
- json_data.append({
758
- "image": image_path,
759
- "boxes": boxes
760
- })
761
 
762
  return json_data
 
501
 
502
  elif file_extension in ['.csv']:
503
  review_file_csv = read_file(file)
504
+ all_annotations_object = convert_pandas_df_to_review_json(review_file_csv, image_file_paths)
505
  json_from_csv = True
506
  print("Converted CSV review file to json")
507
 
 
738
 
739
  return df
740
 
741
+ def convert_pandas_df_to_review_json(df: pd.DataFrame, image_paths: List[Image.Image]) -> List[dict]:
742
+ '''
743
+ Convert a review csv to a json file for use by the Gradio Annotation object
744
+ '''
745
  # Keep only necessary columns
746
  df = df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
747
 
748
  # Group the DataFrame by the 'image' column
749
+ grouped_csv_pages = df.groupby('page')
750
 
751
  # Create a list to hold the JSON data
752
  json_data = []
753
 
754
+ for n, pdf_image_path in enumerate(image_paths):
755
+ reported_page_number = int(n + 1)
756
+
757
+ if reported_page_number in df["page"].values:
758
+
759
+ # Convert each relevant group to a list of box dictionaries
760
+ selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)
761
+ annotation_boxes = selected_csv_pages.drop(columns=['image', 'page']).to_dict(orient='records')
762
+
763
+ annotation = {
764
+ "image": pdf_image_path,
765
+ "boxes": annotation_boxes
766
+ }
767
+
768
+ else:
769
+ annotation = {}
770
+ annotation["image"] = pdf_image_path
771
+
772
  # Append the structured data to the json_data list
773
+ json_data.append(annotation)
 
 
 
774
 
775
  return json_data
tools/file_redaction.py CHANGED
@@ -1307,8 +1307,14 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1307
  character_objects_out.append(char) # Collect character objects
1308
 
1309
  if isinstance(char, LTAnno):
 
 
 
 
 
 
1310
  # Handle space separately by finalizing the word
1311
- full_text += char.get_text() # Adds space or newline
1312
 
1313
  if current_word: # Only finalize if there is a current word
1314
  word_bboxes.append((current_word, current_word_bbox))
@@ -1316,7 +1322,7 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1316
  current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word
1317
 
1318
  # Check for line break (assuming a new line is indicated by a specific character)
1319
- if '\n' in char.get_text():
1320
  #print("char_anno:", char)
1321
  # Finalize the current line
1322
  if current_word:
@@ -1335,7 +1341,6 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1335
 
1336
  # Concatenate text for LTChar
1337
 
1338
-
1339
  #full_text += char.get_text()
1340
  #added_text = re.sub(r'[^\x00-\x7F]+', ' ', char.get_text())
1341
  added_text = char.get_text()
@@ -1344,8 +1349,6 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1344
  added_text = clean_unicode_text(added_text)
1345
  full_text += added_text # Adds space or newline, removing
1346
 
1347
-
1348
-
1349
  # Update overall bounding box
1350
  x0, y0, x1, y1 = char.bbox
1351
  overall_bbox[0] = min(overall_bbox[0], x0) # x0
 
1307
  character_objects_out.append(char) # Collect character objects
1308
 
1309
  if isinstance(char, LTAnno):
1310
+
1311
+ added_text = char.get_text()
1312
+
1313
+ # Handle double quotes
1314
+ added_text = added_text.replace('"', '\\"') # Escape double quotes
1315
+
1316
  # Handle space separately by finalizing the word
1317
+ full_text += added_text # Adds space or newline
1318
 
1319
  if current_word: # Only finalize if there is a current word
1320
  word_bboxes.append((current_word, current_word_bbox))
 
1322
  current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word
1323
 
1324
  # Check for line break (assuming a new line is indicated by a specific character)
1325
+ if '\n' in added_text:
1326
  #print("char_anno:", char)
1327
  # Finalize the current line
1328
  if current_word:
 
1341
 
1342
  # Concatenate text for LTChar
1343
 
 
1344
  #full_text += char.get_text()
1345
  #added_text = re.sub(r'[^\x00-\x7F]+', ' ', char.get_text())
1346
  added_text = char.get_text()
 
1349
  added_text = clean_unicode_text(added_text)
1350
  full_text += added_text # Adds space or newline, removing
1351
 
 
 
1352
  # Update overall bounding box
1353
  x0, y0, x1, y1 = char.bbox
1354
  overall_bbox[0] = min(overall_bbox[0], x0) # x0
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -26,7 +26,8 @@ except:
26
  # #### Custom recognisers
27
  # Allow user to create their own recogniser
28
  def custom_word_list_recogniser(custom_list:List[str]=[]):
29
- custom_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term)}" for term in custom_list) + '\\b'
 
30
  custom_pattern = Pattern(name="custom_pattern",regex=custom_regex, score = 1)
31
 
32
  #print("custom_pattern:", custom_pattern)
 
26
  # #### Custom recognisers
27
  # Allow user to create their own recogniser
28
  def custom_word_list_recogniser(custom_list:List[str]=[]):
29
+ #custom_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term.strip())}" for term in custom_list) + '\\b'
30
+ custom_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term.strip())}(?=\W|$)" for term in custom_list)
31
  custom_pattern = Pattern(name="custom_pattern",regex=custom_regex, score = 1)
32
 
33
  #print("custom_pattern:", custom_pattern)