Commit
·
3518b67
1
Parent(s):
3187788
Corrected large image reduction code
Browse files
- app.py +2 -5
- tools/file_conversion.py +5 -5
- tools/redaction_review.py +2 -0
app.py
CHANGED
|
@@ -371,11 +371,8 @@ with app:
|
|
| 371 |
clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
| 372 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
| 373 |
|
| 374 |
-
#annotation_button_get.click(get_boxes_json, annotator, json_boxes)
|
| 375 |
annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
|
| 376 |
|
| 377 |
-
do_not_save_pdf_state
|
| 378 |
-
|
| 379 |
# Page controls at bottom
|
| 380 |
annotate_current_page_bottom.submit(
|
| 381 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
|
@@ -392,10 +389,10 @@ with app:
|
|
| 392 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
| 393 |
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
| 394 |
|
| 395 |
-
# Review
|
| 396 |
recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
|
| 397 |
|
| 398 |
-
recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=annotate_current_page).\
|
| 399 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
| 400 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
| 401 |
|
|
|
|
| 371 |
clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
| 372 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
| 373 |
|
|
|
|
| 374 |
annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
|
| 375 |
|
|
|
|
|
|
|
| 376 |
# Page controls at bottom
|
| 377 |
annotate_current_page_bottom.submit(
|
| 378 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
|
|
|
| 389 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
| 390 |
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
| 391 |
|
| 392 |
+
# Review table controls
|
| 393 |
recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
|
| 394 |
|
| 395 |
+
recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page]).\
|
| 396 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
| 397 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
| 398 |
|
tools/file_conversion.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
from pdf2image import convert_from_path, pdfinfo_from_path
|
| 2 |
from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
|
| 3 |
from PIL import Image, ImageFile
|
| 4 |
-
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
| 5 |
import os
|
| 6 |
import re
|
| 7 |
import time
|
|
@@ -16,6 +15,7 @@ from typing import List, Optional
|
|
| 16 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 17 |
|
| 18 |
image_dpi = 300.0
|
|
|
|
| 19 |
Image.MAX_IMAGE_PIXELS = None
|
| 20 |
|
| 21 |
def is_pdf_or_image(filename):
|
|
@@ -75,7 +75,7 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_d
|
|
| 75 |
image.save(out_path, format="PNG")
|
| 76 |
|
| 77 |
# Check file size and resize if necessary
|
| 78 |
-
max_size = 5 * 1024 * 1024 # 5 MB in bytes # 5
|
| 79 |
file_size = os.path.getsize(out_path)
|
| 80 |
|
| 81 |
# Resize images if they are too big
|
|
@@ -83,7 +83,7 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_d
|
|
| 83 |
# Start with the original image size
|
| 84 |
width, height = image.size
|
| 85 |
|
| 86 |
-
print(f"Image size before {
|
| 87 |
|
| 88 |
while file_size > max_size:
|
| 89 |
# Reduce the size by a factor (e.g., 50% of the current size)
|
|
@@ -107,9 +107,9 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_d
|
|
| 107 |
print(f"Error processing page {page_num + 1}: {e}")
|
| 108 |
return page_num, None
|
| 109 |
|
| 110 |
-
def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float =
|
| 111 |
|
| 112 |
-
# If preparing for review, just load the first page
|
| 113 |
if prepare_for_review == True:
|
| 114 |
page_count = pdfinfo_from_path(pdf_path)['Pages'] #1
|
| 115 |
else:
|
|
|
|
| 1 |
from pdf2image import convert_from_path, pdfinfo_from_path
|
| 2 |
from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
|
| 3 |
from PIL import Image, ImageFile
|
|
|
|
| 4 |
import os
|
| 5 |
import re
|
| 6 |
import time
|
|
|
|
| 15 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 16 |
|
| 17 |
image_dpi = 300.0
|
| 18 |
+
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
| 19 |
Image.MAX_IMAGE_PIXELS = None
|
| 20 |
|
| 21 |
def is_pdf_or_image(filename):
|
|
|
|
| 75 |
image.save(out_path, format="PNG")
|
| 76 |
|
| 77 |
# Check file size and resize if necessary
|
| 78 |
+
max_size = 4.5 * 1024 * 1024 # 5 MB in bytes # 5
|
| 79 |
file_size = os.path.getsize(out_path)
|
| 80 |
|
| 81 |
# Resize images if they are too big
|
|
|
|
| 83 |
# Start with the original image size
|
| 84 |
width, height = image.size
|
| 85 |
|
| 86 |
+
print(f"Image size before {width}x{height}, original file_size: {file_size}")
|
| 87 |
|
| 88 |
while file_size > max_size:
|
| 89 |
# Reduce the size by a factor (e.g., 50% of the current size)
|
|
|
|
| 107 |
print(f"Error processing page {page_num + 1}: {e}")
|
| 108 |
return page_num, None
|
| 109 |
|
| 110 |
+
def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float = image_dpi, num_threads: int = 8, output_dir: str = '/input'):
|
| 111 |
|
| 112 |
+
# If preparing for review, just load the first page (not used)
|
| 113 |
if prepare_for_review == True:
|
| 114 |
page_count = pdfinfo_from_path(pdf_path)['Pages'] #1
|
| 115 |
else:
|
tools/redaction_review.py
CHANGED
|
@@ -15,6 +15,8 @@ from fitz import Document
|
|
| 15 |
from PIL import ImageDraw, Image
|
| 16 |
from collections import defaultdict
|
| 17 |
|
|
|
|
|
|
|
| 18 |
def decrease_page(number:int):
|
| 19 |
'''
|
| 20 |
Decrease page number for review redactions page.
|
|
|
|
| 15 |
from PIL import ImageDraw, Image
|
| 16 |
from collections import defaultdict
|
| 17 |
|
| 18 |
+
Image.MAX_IMAGE_PIXELS = None
|
| 19 |
+
|
| 20 |
def decrease_page(number:int):
|
| 21 |
'''
|
| 22 |
Decrease page number for review redactions page.
|