ivelin committed on
Commit
7643365
·
1 Parent(s): e4c073a

fix: prediction coordinates translation

Browse files
Files changed (1) hide show
  1. app.py +39 -1
app.py CHANGED
@@ -18,6 +18,41 @@ model = VisionEncoderDecoderModel.from_pretrained(pretrained_repo_name, use_auth
18
  device = "cuda" if torch.cuda.is_available() else "cpu"
19
  model.to(device)
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  def process_refexp(image: Image, prompt: str):
23
 
@@ -89,7 +124,10 @@ def process_refexp(image: Image, prompt: str):
89
  print(f"image width, height: {width, height}")
90
  print(f"processed prompt: {prompt}")
91
 
92
- # safeguard in case text prediction is missing some center point coordinates
 
 
 
93
  x = math.floor(width*center_point["x"])
94
  y = math.floor(height*center_point["y"])
95
 
 
18
  device = "cuda" if torch.cuda.is_available() else "cpu"
19
  model.to(device)
20
 
21
+ def translate_point_coords_from_out_to_in(point=None, input_image_size=None, output_image_size=None):
22
+ """
23
+ Convert relative prediction coordinates from resized encoder tensor image
24
+ to original input image size.
25
+ Args:
26
+ point: dict with 'x' and 'y' relative coordinates in [0..1] range, predicted on the resized encoder tensor image; updated in place
27
+ input_image_size: (width, height) tuple
28
+ output_image_size: (width, height) tuple
29
+ """
30
+ assert point is not None
31
+ assert input_image_size is not None
32
+ assert output_image_size is not None
33
+ # print(f"point={point}, input_image_size={input_image_size}, output_image_size={output_image_size}")
34
+ input_width, input_height = input_image_size
35
+ output_width, output_height = output_image_size
36
+
37
+ ratio = min(output_width/input_width, output_height/input_height)
38
+
39
+ resized_height = int(input_height*ratio)
40
+ # print(f'>>> resized_height={resized_height}')
41
+ resized_width = int(input_width*ratio)
42
+ # print(f'>>> resized_width={resized_width}')
43
+
44
+ if resized_height == input_height and resized_width == input_width:
45
+ return
46
+
47
+ # translation of the relative positioning is only needed for dimensions that have padding
48
+ if resized_width < output_width:
49
+ # adjust for padding pixels
50
+ point['x'] *= (output_width / resized_width)
51
+ if resized_height < output_height:
52
+ # adjust for padding pixels
53
+ point['y'] *= (output_height / resized_height)
54
+ # print(f"translated point={point}, resized_image_size: {resized_width, resized_height}")
55
+
56
 
57
  def process_refexp(image: Image, prompt: str):
58
 
 
124
  print(f"image width, height: {width, height}")
125
  print(f"processed prompt: {prompt}")
126
 
127
+ # convert coordinates from tensor image size to input image size
128
+ out_size = (output_image_size=processor.image_processor.size[1], output_image_size=processor.image_processor.size[0])
129
+ translate_point_coords_from_out_to_in(point=center_point, input_image_size=image.size, output_image_size=out_size)
130
+
131
  x = math.floor(width*center_point["x"])
132
  y = math.floor(height*center_point["y"])
133