ui-refexp-click

Sleeping

App Files Files Community

ivelin committed on Feb 7, 2023

Commit

7643365

1 Parent(s): e4c073a

fix: prediction coordinates translation

Browse files

Files changed (1) hide show

app.py +39 -1

app.py CHANGED Viewed

@@ -18,6 +18,41 @@ model = VisionEncoderDecoderModel.from_pretrained(pretrained_repo_name, use_auth
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
 def process_refexp(image: Image, prompt: str):
@@ -89,7 +124,10 @@ def process_refexp(image: Image, prompt: str):
     print(f"image width, height: {width, height}")
     print(f"processed prompt: {prompt}")
-    # safeguard in case text prediction is missing some center point coordinates
     x = math.floor(width*center_point["x"])
     y = math.floor(height*center_point["y"])

 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
def translate_point_coords_from_out_to_in(point=None, input_image_size=None, output_image_size=None):
    """
    Convert relative prediction coordinates from the resized encoder tensor image
    to the original input image size.

    The input image is modelled as being scaled by one aspect-preserving ratio
    so that it fits inside ``output_image_size``; whatever is left over along
    an axis is treated as padding. A coordinate along a padded axis is rescaled
    so that it is relative to the image content instead of the padded canvas.
    The point is updated in place.

    NOTE(review): only a scale factor is applied, no offset is subtracted, so
    this presumes the padding sits after the image content (right/bottom) -
    confirm against the image processor actually used.

    Args:
        point: mapping with 'x' and 'y' entries holding coordinates in [0..1]
            range relative to the encoder tensor image; modified in place
        input_image_size: (width, height) tuple of the original image
        output_image_size: (width, height) tuple of the encoder tensor image
    Returns:
        None
    Raises:
        AssertionError: if any of the arguments is None
    """
    assert point is not None
    assert input_image_size is not None
    assert output_image_size is not None
    input_width, input_height = input_image_size
    output_width, output_height = output_image_size
    # single scale factor that makes the input fit inside the output canvas
    ratio = min(output_width/input_width, output_height/input_height)
    resized_height = int(input_height*ratio)
    resized_width = int(input_width*ratio)
    # Translation of the relative positioning is only needed for dimensions that have padding.
    # There is deliberately no early exit when the resized size equals the input size:
    # with ratio == 1 one axis can still be padded (e.g. 100x50 on a 100x100 canvas)
    # and its coordinate must be corrected as well.
    if resized_width < output_width:
        # adjust for padding pixels
        point['x'] *= (output_width / resized_width)
    if resized_height < output_height:
        # adjust for padding pixels
        point['y'] *= (output_height / resized_height)
 def process_refexp(image: Image, prompt: str):
     print(f"image width, height: {width, height}")
     print(f"processed prompt: {prompt}")
+    # convert coordinates from tensor image size to input image size
+    out_size = (output_image_size=processor.image_processor.size[1], output_image_size=processor.image_processor.size[0])
+    translate_point_coords_from_out_to_in(point=center_point, input_image_size=image.size, output_image_size=out_size)
     x = math.floor(width*center_point["x"])
     y = math.floor(height*center_point["y"])