seanpedrickcase committed on
Commit
760ef5c
·
1 Parent(s): 20d940b

Corrected image coordinate translation when the PDF mediabox is not the same size as the PDF page rectangle

Browse files
Files changed (2) hide show
  1. tools/file_conversion.py +124 -27
  2. tools/redaction_review.py +7 -2
tools/file_conversion.py CHANGED
@@ -304,44 +304,138 @@ def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:d
304
  #shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
305
  shape.commit()
306
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
308
  '''
309
  Converts coordinates from pymupdf format to image coordinates,
310
- accounting for mediabox dimensions.
311
  '''
 
 
 
 
312
 
313
- rect_height = pymupdf_page.rect.height
314
- rect_width = pymupdf_page.rect.width
315
-
316
- # Get mediabox dimensions
317
  mediabox = pymupdf_page.mediabox
318
  mediabox_width = mediabox.width
319
  mediabox_height = mediabox.height
320
 
 
321
  image_page_width, image_page_height = image.size
322
 
323
- # Calculate scaling factors using mediabox dimensions
324
- scale_width = image_page_width / mediabox_width
325
- scale_height = image_page_height / mediabox_height
 
 
 
326
 
327
- #print("scale_width:", scale_width)
328
- #print("scale_height:", scale_height)
 
329
 
330
- rect_to_mediabox_x_scale = mediabox_width / rect_width
331
- rect_to_mediabox_y_scale = mediabox_height / rect_height
 
 
 
 
 
 
 
 
 
 
 
 
 
332
 
333
- #print("rect_to_mediabox_x_scale:", rect_to_mediabox_x_scale)
334
- #print("rect_to_mediabox_y_scale:", rect_to_mediabox_y_scale)
335
 
336
- # Adjust coordinates based on scaling factors
337
- x1_image = (x1 * scale_width) * rect_to_mediabox_x_scale
338
- y1_image = (y1 * scale_height) * rect_to_mediabox_y_scale
339
- x2_image = (x2 * scale_width) * rect_to_mediabox_x_scale
340
- y2_image = (y2 * scale_height) * rect_to_mediabox_y_scale
 
 
 
 
 
 
 
 
341
 
342
  return x1_image, y1_image, x2_image, y2_image
343
 
344
 
 
345
  def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5):
346
  # Small border to page that remains white
347
  border = 5
@@ -598,13 +692,16 @@ def prepare_image_or_pdf(
598
  all_annotations_object.append(annotation)
599
 
600
  #print("annotation:", annotation, "for page:", str(i))
601
-
602
- if not annotation:
603
- annotation = {"image":"", "boxes": []}
604
- annotation_page_number = int(re.search(r'_(\d+)\.png$', image_file_path).group(1))
605
-
606
- else:
607
- annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
 
 
 
608
  #print("Annotation page number:", annotation_page_number)
609
 
610
  # Check if the annotation page number exists in the image file paths pages
@@ -744,7 +841,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
744
  #print(number) # Output: 0
745
  reported_number = int(number) + 1
746
  else:
747
- print("No number found before .png")
748
  reported_number = 1
749
 
750
  # Check if 'boxes' is in the annotation, if not, add an empty list
 
304
  #shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
305
  shape.commit()
306
 
307
+ # def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
308
+ # '''
309
+ # Converts coordinates from pymupdf format to image coordinates,
310
+ # accounting for mediabox dimensions and offset.
311
+ # '''
312
+ # # Get rect dimensions
313
+ # rect = pymupdf_page.rect
314
+ # rect_width = rect.width
315
+ # rect_height = rect.height
316
+
317
+ # # Get mediabox dimensions and position
318
+ # mediabox = pymupdf_page.mediabox
319
+ # mediabox_width = mediabox.width
320
+ # mediabox_height = mediabox.height
321
+
322
+ # # Get target image dimensions
323
+ # image_page_width, image_page_height = image.size
324
+
325
+ # # Calculate scaling factors
326
+ # image_to_mediabox_x_scale = image_page_width / mediabox_width
327
+ # image_to_mediabox_y_scale = image_page_height / mediabox_height
328
+
329
+ # image_to_rect_scale_width = image_page_width / rect_width
330
+ # image_to_rect_scale_height = image_page_height / rect_height
331
+
332
+ # # Adjust for offsets (difference in position between mediabox and rect)
333
+ # x_offset = rect.x0 - mediabox.x0 # Difference in x position
334
+ # y_offset = rect.y0 - mediabox.y0 # Difference in y position
335
+
336
+ # print("x_offset:", x_offset)
337
+ # print("y_offset:", y_offset)
338
+
339
+ # # Adjust coordinates:
340
+ # # Apply scaling to match image dimensions
341
+ # x1_image = x1 * image_to_mediabox_x_scale
342
+ # x2_image = x2 * image_to_mediabox_x_scale
343
+ # y1_image = y1 * image_to_mediabox_y_scale
344
+ # y2_image = y2 * image_to_mediabox_y_scale
345
+
346
+ # # Correct for difference in rect and mediabox size
347
+ # if mediabox_width != rect_width:
348
+
349
+ # mediabox_to_rect_x_scale = mediabox_width / rect_width
350
+ # mediabox_to_rect_y_scale = mediabox_height / rect_height
351
+
352
+ # x1_image *= mediabox_to_rect_x_scale
353
+ # x2_image *= mediabox_to_rect_x_scale
354
+ # y1_image *= mediabox_to_rect_y_scale
355
+ # y2_image *= mediabox_to_rect_y_scale
356
+
357
+ # print("mediabox_to_rect_x_scale:", mediabox_to_rect_x_scale)
358
+ # #print("mediabox_to_rect_y_scale:", mediabox_to_rect_y_scale)
359
+
360
+ # print("image_to_mediabox_x_scale:", image_to_mediabox_x_scale)
361
+ # #print("image_to_mediabox_y_scale:", image_to_mediabox_y_scale)
362
+
363
+ # mediabox_rect_x_diff = (mediabox_width - rect_width) * 2
364
+ # mediabox_rect_y_diff = (mediabox_height - rect_height) * 2
365
+
366
+ # x1_image -= mediabox_rect_x_diff
367
+ # x2_image -= mediabox_rect_x_diff
368
+ # y1_image += mediabox_rect_y_diff
369
+ # y2_image += mediabox_rect_y_diff
370
+
371
+ # return x1_image, y1_image, x2_image, y2_image
372
+
373
  def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
374
  '''
375
  Converts coordinates from pymupdf format to image coordinates,
376
+ accounting for mediabox dimensions and offset.
377
  '''
378
+ # Get rect dimensions
379
+ rect = pymupdf_page.rect
380
+ rect_width = rect.width
381
+ rect_height = rect.height
382
 
383
+ # Get mediabox dimensions and position
 
 
 
384
  mediabox = pymupdf_page.mediabox
385
  mediabox_width = mediabox.width
386
  mediabox_height = mediabox.height
387
 
388
+ # Get target image dimensions
389
  image_page_width, image_page_height = image.size
390
 
391
+ # Calculate scaling factors
392
+ image_to_mediabox_x_scale = image_page_width / mediabox_width
393
+ image_to_mediabox_y_scale = image_page_height / mediabox_height
394
+
395
+ image_to_rect_scale_width = image_page_width / rect_width
396
+ image_to_rect_scale_height = image_page_height / rect_height
397
 
398
+ # Adjust for offsets (difference in position between mediabox and rect)
399
+ x_offset = rect.x0 - mediabox.x0 # Difference in x position
400
+ y_offset = rect.y0 - mediabox.y0 # Difference in y position
401
 
402
+ #print("x_offset:", x_offset)
403
+ #print("y_offset:", y_offset)
404
+
405
+ # Adjust coordinates:
406
+ # Apply scaling to match image dimensions
407
+ x1_image = x1 * image_to_mediabox_x_scale
408
+ x2_image = x2 * image_to_mediabox_x_scale
409
+ y1_image = y1 * image_to_mediabox_y_scale
410
+ y2_image = y2 * image_to_mediabox_y_scale
411
+
412
+ # Correct for difference in rect and mediabox size
413
+ if mediabox_width != rect_width:
414
+
415
+ mediabox_to_rect_x_scale = mediabox_width / rect_width
416
+ mediabox_to_rect_y_scale = mediabox_height / rect_height
417
 
418
+ rect_to_mediabox_x_scale = rect_width / mediabox_width
419
+ #rect_to_mediabox_y_scale = rect_height / mediabox_height
420
 
421
+ mediabox_rect_x_diff = (mediabox_width - rect_width) * (image_to_mediabox_x_scale / 2)
422
+ mediabox_rect_y_diff = (mediabox_height - rect_height) * (image_to_mediabox_y_scale / 2)
423
+
424
+ x1_image -= mediabox_rect_x_diff
425
+ x2_image -= mediabox_rect_x_diff
426
+ y1_image += mediabox_rect_y_diff
427
+ y2_image += mediabox_rect_y_diff
428
+
429
+ #
430
+ x1_image *= mediabox_to_rect_x_scale
431
+ x2_image *= mediabox_to_rect_x_scale
432
+ y1_image *= mediabox_to_rect_y_scale
433
+ y2_image *= mediabox_to_rect_y_scale
434
 
435
  return x1_image, y1_image, x2_image, y2_image
436
 
437
 
438
+
439
  def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5):
440
  # Small border to page that remains white
441
  border = 5
 
692
  all_annotations_object.append(annotation)
693
 
694
  #print("annotation:", annotation, "for page:", str(i))
695
+ try:
696
+ if not annotation:
697
+ annotation = {"image":"", "boxes": []}
698
+ annotation_page_number = int(re.search(r'_(\d+)\.png$', image_file_path).group(1))
699
+
700
+ else:
701
+ annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
702
+ except Exception as e:
703
+ print("Extracting page number from image failed due to:", e)
704
+ annotation_page_number = 0
705
  #print("Annotation page number:", annotation_page_number)
706
 
707
  # Check if the annotation page number exists in the image file paths pages
 
841
  #print(number) # Output: 0
842
  reported_number = int(number) + 1
843
  else:
844
+ print("No number found before .png. Returning page 1.")
845
  reported_number = 1
846
 
847
  # Check if 'boxes' is in the annotation, if not, add an empty list
tools/redaction_review.py CHANGED
@@ -295,9 +295,14 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
295
  fill = img_annotation_box["color"]
296
 
297
  draw.rectangle(coords, fill=fill)
298
-
 
299
  image.save(output_folder + file_name_without_ext + "_redacted.png")
300
 
 
 
 
 
301
  doc = [image]
302
 
303
  elif file_extension in '.csv':
@@ -347,7 +352,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
347
  output_files.append(out_pdf_file_path)
348
 
349
  else:
350
- print("PDF input not found.")
351
 
352
  # If save_pdf is not true, then add the original pdf to the output files
353
  else:
 
295
  fill = img_annotation_box["color"]
296
 
297
  draw.rectangle(coords, fill=fill)
298
+
299
+ output_image_path = output_folder + file_name_without_ext + "_redacted.png"
300
  image.save(output_folder + file_name_without_ext + "_redacted.png")
301
 
302
+ output_files.append(output_image_path)
303
+
304
+ print("Redactions saved to image file")
305
+
306
  doc = [image]
307
 
308
  elif file_extension in '.csv':
 
352
  output_files.append(out_pdf_file_path)
353
 
354
  else:
355
+ print("PDF input not found. Outputs not saved to PDF.")
356
 
357
  # If save_pdf is not true, then add the original pdf to the output files
358
  else: