seanpedrickcase committed on
Commit
15026f7
·
1 Parent(s): d8c98c8

Corrected outputs for situations where the PDF mediabox size differs from the visible page size

Browse files
Files changed (1) hide show
  1. tools/file_redaction.py +38 -19
tools/file_redaction.py CHANGED
@@ -225,30 +225,47 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
225
 
226
  return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
227
 
228
- def redact_page_with_pymupdf(doc, annotations_on_page, page_no, scale=(1,1)):
229
 
230
  page = doc.load_page(page_no)
231
- page_height = max(page.rect.height, page.mediabox[3] - page.mediabox[1])
 
 
 
232
 
233
  #print("page_rect_height:", page.rect.height)
234
  #print("page mediabox size:", page.mediabox[3] - page.mediabox[1])
235
 
236
  for annot in annotations_on_page:
237
  if isinstance(annot, CustomImageRecognizerResult):
238
- scale_width = scale[0]
239
- scale_height = scale[1]
240
 
241
- print("scale:", scale)
 
 
 
 
 
 
 
242
 
243
  # Calculate scaled coordinates
244
- x1 = annot.left * scale_width
245
- new_y1 = (annot.top * scale_height) # Flip Y0 (since it starts from bottom)
246
- x2 = (annot.left + annot.width) * scale_width # Calculate x1
247
- new_y2 = ((annot.top + annot.height) * scale_height) # Calculate y1 correctly
248
 
249
  rect = Rect(x1, new_y1, x2, new_y2) # Create the PyMuPDF Rect (y1, y0 are flipped)
250
 
251
  else:
 
 
 
 
 
 
 
 
252
  #print("In the pikepdf conversion function")
253
  # Extract the /Rect field
254
  rect_field = annot["/Rect"]
@@ -258,8 +275,10 @@ def redact_page_with_pymupdf(doc, annotations_on_page, page_no, scale=(1,1)):
258
 
259
  # Convert the Y-coordinates (flip using the page height)
260
  x1, y1, x2, y2 = rect_coordinates
261
- new_y1 = page_height - y2
262
- new_y2 = page_height - y1
 
 
263
 
264
  rect = Rect(x1, new_y1, x2, new_y2)
265
 
@@ -482,18 +501,18 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
482
 
483
 
484
  # Get the dimensions of the page in points with pymupdf to get relative scale
485
- page = doc.load_page(i)
486
- mu_page_rect = page.rect
487
  #mu_page_width = mu_page_rect.width
488
- mu_page_height = max(mu_page_rect.height, page.mediabox[3] - page.mediabox[1])
489
- mu_page_width = max(mu_page_rect.width, page.mediabox[2] - page.mediabox[0])
490
  #mu_page_height = mu_page_rect.height
491
 
492
  # Calculate scaling factors between PIL image and pymupdf
493
- scale_width = mu_page_width / page_width
494
- scale_height = mu_page_height / page_height
495
 
496
- scale = (scale_width, scale_height)
497
 
498
 
499
  # Possibility to use different languages
@@ -583,7 +602,7 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
583
 
584
  ## Apply annotations with pymupdf
585
  else:
586
- doc = redact_page_with_pymupdf(doc, merged_redaction_bboxes, i, scale)
587
 
588
  #doc.save("image_redact.pdf")
589
 
 
225
 
226
  return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
227
 
228
+ def redact_page_with_pymupdf(doc, annotations_on_page, page_no, image = None):#, scale=(1,1)):
229
 
230
  page = doc.load_page(page_no)
231
+ mediabox_height = page.mediabox[3] - page.mediabox[1]
232
+ mediabox_width = page.mediabox[2] - page.mediabox[0]
233
+ rect_height = page.rect.height
234
+ rect_width = page.rect.width
235
 
236
  #print("page_rect_height:", page.rect.height)
237
  #print("page mediabox size:", page.mediabox[3] - page.mediabox[1])
238
 
239
  for annot in annotations_on_page:
240
  if isinstance(annot, CustomImageRecognizerResult):
241
+ image_page_width, image_page_height = image.size
 
242
 
243
+ # Calculate scaling factors between PIL image and pymupdf
244
+ scale_width = rect_width / image_page_width
245
+ scale_height = rect_height / image_page_height
246
+
247
+ #scale_width = scale[0]
248
+ #scale_height = scale[1]
249
+
250
+ #print("scale:", scale)
251
 
252
  # Calculate scaled coordinates
253
+ x1 = (annot.left * scale_width)# + page_x_adjust
254
+ new_y1 = (annot.top * scale_height)# - page_y_adjust # Flip Y0 (since it starts from bottom)
255
+ x2 = ((annot.left + annot.width) * scale_width)# + page_x_adjust # Calculate x1
256
+ new_y2 = ((annot.top + annot.height) * scale_height)# - page_y_adjust # Calculate y1 correctly
257
 
258
  rect = Rect(x1, new_y1, x2, new_y2) # Create the PyMuPDF Rect (y1, y0 are flipped)
259
 
260
  else:
261
+ # Calculate scaling factors
262
+ scale_height = rect_height / mediabox_height if mediabox_height else 1
263
+ scale_width = rect_width / mediabox_width if mediabox_width else 1
264
+
265
+ # Adjust coordinates based on scaling factors
266
+ page_x_adjust = (rect_width - mediabox_width) / 2 # Center adjustment
267
+ page_y_adjust = (rect_height - mediabox_height) / 2 # Center adjustment
268
+
269
  #print("In the pikepdf conversion function")
270
  # Extract the /Rect field
271
  rect_field = annot["/Rect"]
 
275
 
276
  # Convert the Y-coordinates (flip using the page height)
277
  x1, y1, x2, y2 = rect_coordinates
278
+ x1 = x1 + page_x_adjust
279
+ new_y1 = (rect_height - y2) - page_y_adjust
280
+ x2 = x2 + page_x_adjust
281
+ new_y2 = (rect_height - y1) - page_y_adjust
282
 
283
  rect = Rect(x1, new_y1, x2, new_y2)
284
 
 
501
 
502
 
503
  # Get the dimensions of the page in points with pymupdf to get relative scale
504
+ #page = doc.load_page(i)
505
+ #mu_page_rect = page.rect
506
  #mu_page_width = mu_page_rect.width
507
+ #mu_page_height = max(mu_page_rect.height, page.mediabox[3] - page.mediabox[1])
508
+ #mu_page_width = max(mu_page_rect.width, page.mediabox[2] - page.mediabox[0])
509
  #mu_page_height = mu_page_rect.height
510
 
511
  # Calculate scaling factors between PIL image and pymupdf
512
+ #scale_width = mu_page_width / page_width
513
+ #scale_height = mu_page_height / page_height
514
 
515
+ #scale = (scale_width, scale_height)
516
 
517
 
518
  # Possibility to use different languages
 
602
 
603
  ## Apply annotations with pymupdf
604
  else:
605
+ doc = redact_page_with_pymupdf(doc, merged_redaction_bboxes, i, image)#, scale)
606
 
607
  #doc.save("image_redact.pdf")
608