Spaces:
Sleeping
Sleeping
Commit
·
613b1b4
1
Parent(s):
e8681e8
Uploaded PDFs with review files will now include all pages that don't have redactions. Slightly improved deny-list matching.
Browse files
- app.py +0 -2
- tools/file_conversion.py +25 -12
- tools/file_redaction.py +8 -5
- tools/load_spacy_model_custom_recognisers.py +2 -1
app.py
CHANGED
@@ -346,8 +346,6 @@ with app:
|
|
346 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
347 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
348 |
|
349 |
-
|
350 |
-
|
351 |
annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
352 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
353 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
|
|
346 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
347 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
348 |
|
|
|
|
|
349 |
annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
350 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
351 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
tools/file_conversion.py
CHANGED
@@ -501,7 +501,7 @@ def prepare_image_or_pdf(
|
|
501 |
|
502 |
elif file_extension in ['.csv']:
|
503 |
review_file_csv = read_file(file)
|
504 |
-
all_annotations_object = convert_pandas_df_to_review_json(review_file_csv)
|
505 |
json_from_csv = True
|
506 |
print("Converted CSV review file to json")
|
507 |
|
@@ -738,25 +738,38 @@ def convert_review_json_to_pandas_df(data:List[dict], text_join_data=pd.DataFram
|
|
738 |
|
739 |
return df
|
740 |
|
741 |
-
def convert_pandas_df_to_review_json(df: pd.DataFrame) -> List[dict]:
|
|
|
|
|
|
|
742 |
# Keep only necessary columns
|
743 |
df = df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
|
744 |
|
745 |
# Group the DataFrame by the 'image' column
|
746 |
-
|
747 |
|
748 |
# Create a list to hold the JSON data
|
749 |
json_data = []
|
750 |
|
751 |
-
|
752 |
-
|
753 |
-
|
754 |
-
|
755 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
756 |
# Append the structured data to the json_data list
|
757 |
-
json_data.append(
|
758 |
-
"image": image_path,
|
759 |
-
"boxes": boxes
|
760 |
-
})
|
761 |
|
762 |
return json_data
|
|
|
501 |
|
502 |
elif file_extension in ['.csv']:
|
503 |
review_file_csv = read_file(file)
|
504 |
+
all_annotations_object = convert_pandas_df_to_review_json(review_file_csv, image_file_paths)
|
505 |
json_from_csv = True
|
506 |
print("Converted CSV review file to json")
|
507 |
|
|
|
738 |
|
739 |
return df
|
740 |
|
741 |
+
def convert_pandas_df_to_review_json(df: pd.DataFrame, image_paths: List[Image.Image]) -> List[dict]:
|
742 |
+
'''
|
743 |
+
Convert a review csv to a json file for use by the Gradio Annotation object
|
744 |
+
'''
|
745 |
# Keep only necessary columns
|
746 |
df = df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
|
747 |
|
748 |
# Group the DataFrame by the 'image' column
|
749 |
+
grouped_csv_pages = df.groupby('page')
|
750 |
|
751 |
# Create a list to hold the JSON data
|
752 |
json_data = []
|
753 |
|
754 |
+
for n, pdf_image_path in enumerate(image_paths):
|
755 |
+
reported_page_number = int(n + 1)
|
756 |
+
|
757 |
+
if reported_page_number in df["page"].values:
|
758 |
+
|
759 |
+
# Convert each relevant group to a list of box dictionaries
|
760 |
+
selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)
|
761 |
+
annotation_boxes = selected_csv_pages.drop(columns=['image', 'page']).to_dict(orient='records')
|
762 |
+
|
763 |
+
annotation = {
|
764 |
+
"image": pdf_image_path,
|
765 |
+
"boxes": annotation_boxes
|
766 |
+
}
|
767 |
+
|
768 |
+
else:
|
769 |
+
annotation = {}
|
770 |
+
annotation["image"] = pdf_image_path
|
771 |
+
|
772 |
# Append the structured data to the json_data list
|
773 |
+
json_data.append(annotation)
|
|
|
|
|
|
|
774 |
|
775 |
return json_data
|
tools/file_redaction.py
CHANGED
@@ -1307,8 +1307,14 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1307 |
character_objects_out.append(char) # Collect character objects
|
1308 |
|
1309 |
if isinstance(char, LTAnno):
|
|
|
|
|
|
|
|
|
|
|
|
|
1310 |
# Handle space separately by finalizing the word
|
1311 |
-
full_text +=
|
1312 |
|
1313 |
if current_word: # Only finalize if there is a current word
|
1314 |
word_bboxes.append((current_word, current_word_bbox))
|
@@ -1316,7 +1322,7 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1316 |
current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word
|
1317 |
|
1318 |
# Check for line break (assuming a new line is indicated by a specific character)
|
1319 |
-
if '\n' in
|
1320 |
#print("char_anno:", char)
|
1321 |
# Finalize the current line
|
1322 |
if current_word:
|
@@ -1335,7 +1341,6 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1335 |
|
1336 |
# Concatenate text for LTChar
|
1337 |
|
1338 |
-
|
1339 |
#full_text += char.get_text()
|
1340 |
#added_text = re.sub(r'[^\x00-\x7F]+', ' ', char.get_text())
|
1341 |
added_text = char.get_text()
|
@@ -1344,8 +1349,6 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1344 |
added_text = clean_unicode_text(added_text)
|
1345 |
full_text += added_text # Adds space or newline, removing
|
1346 |
|
1347 |
-
|
1348 |
-
|
1349 |
# Update overall bounding box
|
1350 |
x0, y0, x1, y1 = char.bbox
|
1351 |
overall_bbox[0] = min(overall_bbox[0], x0) # x0
|
|
|
1307 |
character_objects_out.append(char) # Collect character objects
|
1308 |
|
1309 |
if isinstance(char, LTAnno):
|
1310 |
+
|
1311 |
+
added_text = char.get_text()
|
1312 |
+
|
1313 |
+
# Handle double quotes
|
1314 |
+
added_text = added_text.replace('"', '\\"') # Escape double quotes
|
1315 |
+
|
1316 |
# Handle space separately by finalizing the word
|
1317 |
+
full_text += added_text # Adds space or newline
|
1318 |
|
1319 |
if current_word: # Only finalize if there is a current word
|
1320 |
word_bboxes.append((current_word, current_word_bbox))
|
|
|
1322 |
current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word
|
1323 |
|
1324 |
# Check for line break (assuming a new line is indicated by a specific character)
|
1325 |
+
if '\n' in added_text:
|
1326 |
#print("char_anno:", char)
|
1327 |
# Finalize the current line
|
1328 |
if current_word:
|
|
|
1341 |
|
1342 |
# Concatenate text for LTChar
|
1343 |
|
|
|
1344 |
#full_text += char.get_text()
|
1345 |
#added_text = re.sub(r'[^\x00-\x7F]+', ' ', char.get_text())
|
1346 |
added_text = char.get_text()
|
|
|
1349 |
added_text = clean_unicode_text(added_text)
|
1350 |
full_text += added_text # Adds space or newline, removing
|
1351 |
|
|
|
|
|
1352 |
# Update overall bounding box
|
1353 |
x0, y0, x1, y1 = char.bbox
|
1354 |
overall_bbox[0] = min(overall_bbox[0], x0) # x0
|
tools/load_spacy_model_custom_recognisers.py
CHANGED
@@ -26,7 +26,8 @@ except:
|
|
26 |
# #### Custom recognisers
|
27 |
# Allow user to create their own recogniser
|
28 |
def custom_word_list_recogniser(custom_list:List[str]=[]):
|
29 |
-
custom_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term)}" for term in custom_list) + '\\b'
|
|
|
30 |
custom_pattern = Pattern(name="custom_pattern",regex=custom_regex, score = 1)
|
31 |
|
32 |
#print("custom_pattern:", custom_pattern)
|
|
|
26 |
# #### Custom recognisers
|
27 |
# Allow user to create their own recogniser
|
28 |
def custom_word_list_recogniser(custom_list:List[str]=[]):
|
29 |
+
#custom_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term.strip())}" for term in custom_list) + '\\b'
|
30 |
+
custom_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term.strip())}(?=\W|$)" for term in custom_list)
|
31 |
custom_pattern = Pattern(name="custom_pattern",regex=custom_regex, score = 1)
|
32 |
|
33 |
#print("custom_pattern:", custom_pattern)
|