Spaces:
Sleeping
Sleeping
Comprehend now uses custom spacy recognisers on top of defaults. Added zoom functionality to annotator. Fixed some pdf mediabox issues and redacted image output issues.
ec98119
import gradio as gr | |
import numpy as np | |
from typing import List | |
from gradio_image_annotation import image_annotator | |
from gradio_image_annotation.image_annotator import AnnotatedImageData | |
from tools.file_conversion import is_pdf, convert_pdf_to_images | |
from tools.helper_functions import get_file_path_end, output_folder | |
from tools.file_redaction import redact_page_with_pymupdf | |
import json | |
import pymupdf | |
from fitz import Document | |
from PIL import ImageDraw, Image | |
def decrease_page(number:int):
    '''
    Step the review redactions page back by one, never going below page 1.

    The resulting page number is returned twice, as a 2-tuple.
    '''
    previous_page = number - 1 if number > 1 else 1
    return previous_page, previous_page
def increase_page(number:int, image_annotator_object:AnnotatedImageData):
    '''
    Step the review redactions page forward by one, capped at the last page.

    The page count is taken from the length of image_annotator_object; when
    that object is empty or None the page is reset to 1. The resulting page
    number is returned twice, as a 2-tuple.
    '''
    # Nothing loaded yet: fall back to the first page
    if not image_annotator_object:
        return 1, 1

    last_page = len(image_annotator_object)
    next_page = number + 1 if number < last_page else last_page
    return next_page, next_page
def update_zoom(current_zoom_level:int, decrease:bool=True):
    '''
    Move the annotator zoom level one step of 10 and return the new level.

    NOTE: the flag works the opposite way round to what its name suggests and
    callers may rely on that:
    - decrease == False lowers the level by 10, but only while it is >= 50.
    - any other value (including the default True) raises the level by 10,
      but only while it is < 100.
    Outside those bounds the level is returned unchanged.
    '''
    # Compared with == (not truthiness) so that e.g. None still takes the "raise" path
    lower_level = decrease == False

    if lower_level and current_zoom_level >= 50:
        return current_zoom_level - 10
    if not lower_level and current_zoom_level < 100:
        return current_zoom_level + 10
    return current_zoom_level
def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zoom:int=100):
    '''
    Build the annotator component and page number box for the review redactions page.

    image_annotator_object: per-page annotation entries, indexed by page number - 1.
    page_num: 1-based page requested. None, 0 and negative values are shown as
        page 1; values past the end are clamped to the last page.
    zoom: percentage applied to both the height and the width of the annotator.

    Returns a 3-tuple (annotator, page number box, page number box) on every
    path, so the number of returned values is the same whether or not any
    annotations are loaded.
    '''
    zoom_str = str(zoom) + '%'

    if not image_annotator_object:
        # Nothing to review yet: show an empty, read-only annotator on page 1
        empty_annotator = image_annotator(
            label="Modify redaction boxes",
            show_label=False,
            sources=["upload"],
            show_clear_button=False,
            show_share_button=False,
            show_remove_button=False,
            interactive=False
        )
        first_page = gr.Number(label = "Page (press enter to change)", value=1, precision=0)

        # Previously only two values were returned here, unlike the main path below
        return empty_annotator, first_page, first_page

    # Clamp the requested page into the range 1..number of pages
    page_max_reported = len(image_annotator_object)
    if page_num is None or page_num < 1:
        page_num_reported = 1
    else:
        page_num_reported = min(page_num, page_max_reported)

    out_image_annotator = image_annotator(
        value = image_annotator_object[page_num_reported - 1],
        boxes_alpha=0.1,
        box_thickness=1,
        show_label=False,
        height=zoom_str,
        width=zoom_str,
        box_min_size=1,
        box_selected_thickness=2,
        handle_size=4,
        sources=None,
        show_clear_button=False,
        show_share_button=False,
        show_remove_button=False,
        handles_cursor=True,
        interactive=True
    )

    number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)

    return out_image_annotator, number_reported, number_reported
def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData]):
    '''
    Write the edited annotations for the previously shown page back into the full list.

    The edited entry keeps the image that was already stored for that page and
    then replaces the stored entry (all_image_annotations is modified in place).
    When previous_page is empty or 0, i.e. on the first run, nothing is changed.

    Returns (all_image_annotations, current page, current page). A missing
    current page is reported as 1, except on the first-run path where it is
    passed through untouched.
    '''
    if previous_page:
        page_to_show = current_page if current_page else 1
        stored_index = previous_page - 1

        # Keep the stored page image; only the rest of the entry comes from the edit
        image_annotated['image'] = all_image_annotations[stored_index]["image"]
        all_image_annotations[stored_index] = image_annotated

        return all_image_annotations, page_to_show, page_to_show

    # First run: no previous page to overwrite
    return all_image_annotations, current_page, current_page
def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, progress=gr.Progress(track_tqdm=True)):
    '''
    Apply the modified redactions to the input file and save the results.

    image_annotated: annotations for the page currently shown in the annotator.
    file_paths: a single path, or a list of uploaded file objects whose last
        item's .name is used as the path.
    doc: the current document; returned unchanged for PDFs, replaced by a
        one-item list holding the redacted image for image inputs.
    all_image_annotations: per-page annotations, updated in place with
        image_annotated for current_page.
    current_page: 1-based page shown in the annotator (empty/0 is treated as 1).

    Returns (doc, all_image_annotations, output_files) on every path, where
    output_files lists the files that were written.
    '''
    output_files = []

    # Check before touching image_annotated, and keep the same return shape as the main path
    if not image_annotated or not all_image_annotations:
        print("No image annotations found")
        return doc, all_image_annotations, output_files

    if not current_page:
        current_page = 1

    # Store the edits for the visible page, keeping the page image already on record
    image_annotated['image'] = all_image_annotations[current_page - 1]["image"]
    all_image_annotations[current_page - 1] = image_annotated

    if isinstance(file_paths, list):
        file_path = file_paths[-1].name
    else:
        file_path = file_paths

    print("file_path:", file_path)
    file_base = get_file_path_end(file_path)

    # If working with image docs
    if not is_pdf(file_path):
        # Open via the resolved path: indexing file_paths would yield a single
        # character when a plain string was passed in
        image = Image.open(file_path)
        draw = ImageDraw.Draw(image)

        for img_annotation_box in image_annotated['boxes']:
            coords = [img_annotation_box["xmin"],
                      img_annotation_box["ymin"],
                      img_annotation_box["xmax"],
                      img_annotation_box["ymax"]]
            draw.rectangle(coords, fill=img_annotation_box["color"])

        out_image_file_path = output_folder + file_base + "_redacted_mod.png"
        image.save(out_image_file_path)
        output_files.append(out_image_file_path)

        doc = [image]

    # If working with pdfs
    else:
        unredacted_doc = pymupdf.open(file_path)
        try:
            number_of_pages = unredacted_doc.page_count

            print("Saving pages to file.")

            for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"):
                image_loc = all_image_annotations[i]['image']

                # Load in image object
                if isinstance(image_loc, np.ndarray):
                    image = Image.fromarray(image_loc.astype('uint8'))
                elif isinstance(image_loc, Image.Image):
                    image = image_loc
                elif isinstance(image_loc, str):
                    image = Image.open(image_loc)
                else:
                    # Fail loudly instead of reusing the previous page's image
                    raise TypeError("Unsupported image type for page " + str(i + 1) + ": " + type(image_loc).__name__)

                pymupdf_page = unredacted_doc.load_page(i)
                redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)

            out_pdf_file_path = output_folder + file_base + "_redacted_mod.pdf"
            unredacted_doc.save(out_pdf_file_path)
            output_files.append(out_pdf_file_path)
        finally:
            # The reopened document is local to this call, so release it here
            unredacted_doc.close()

    # Save the gradio_annotation_boxes to a JSON file (best effort)
    out_annotation_file_path = output_folder + file_base + '_modified_redactions.json'
    try:
        # Serialise first so a failure does not leave a half-written file behind
        annotations_json = json.dumps(all_image_annotations)
        with open(out_annotation_file_path, 'w') as f:
            f.write(annotations_json)
    except Exception as e:
        print("Could not save annotations to json file.", e)
    else:
        output_files.append(out_annotation_file_path)

    return doc, all_image_annotations, output_files
def crop(annotations:AnnotatedImageData):
    '''
    Return the part of annotations["image"] covered by the first box.

    The image is indexed as [ymin:ymax, xmin:xmax] using the first entry of
    annotations["boxes"]. Returns None when there are no boxes.
    '''
    boxes = annotations["boxes"]
    if not boxes:
        return None

    first_box = boxes[0]
    rows = slice(first_box["ymin"], first_box["ymax"])
    cols = slice(first_box["xmin"], first_box["xmax"])
    return annotations["image"][rows, cols]
def get_boxes_json(annotations:AnnotatedImageData):
    '''
    Return the "boxes" entry of the given annotations, unchanged.
    '''
    boxes = annotations["boxes"]
    return boxes