Spaces:
Sleeping
Sleeping
fix: prediction coordinates translation
Browse files
app.py
CHANGED
@@ -18,6 +18,41 @@ model = VisionEncoderDecoderModel.from_pretrained(pretrained_repo_name, use_auth
|
|
18 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
19 |
model.to(device)
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
def process_refexp(image: Image, prompt: str):
|
23 |
|
@@ -89,7 +124,10 @@ def process_refexp(image: Image, prompt: str):
|
|
89 |
print(f"image width, height: {width, height}")
|
90 |
print(f"processed prompt: {prompt}")
|
91 |
|
92 |
-
#
|
|
|
|
|
|
|
93 |
x = math.floor(width*center_point["x"])
|
94 |
y = math.floor(height*center_point["y"])
|
95 |
|
|
|
18 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
19 |
model.to(device)
|
20 |
|
21 |
+
def translate_point_coords_from_out_to_in(point=None, input_image_size=None, output_image_size=None):
    """
    Convert relative prediction coordinates from the resized (and padded)
    encoder tensor image back to the original input image. Mutates `point`
    in place and returns None.

    Args:
        point: dict with "x" and "y" relative coordinates in [0..1] of the
            point in the output (tensor) image.
        input_image_size: (width, height) tuple of the original input image.
        output_image_size: (width, height) tuple of the encoder tensor image.
    """
    assert point is not None
    assert input_image_size is not None
    assert output_image_size is not None

    input_width, input_height = input_image_size
    output_width, output_height = output_image_size

    # The encoder resizes with preserved aspect ratio, then pads the short
    # dimension(s) up to the output canvas size.
    ratio = min(output_width / input_width, output_height / input_height)
    resized_width = int(input_width * ratio)
    resized_height = int(input_height * ratio)

    # No padding at all: relative coordinates are already correct.
    # NOTE: compare against the OUTPUT size — comparing against the input size
    # would wrongly skip the adjustment when the image fits without resizing
    # but the canvas is still padded.
    if resized_width == output_width and resized_height == output_height:
        return

    # Translation of the relative positioning is only needed for dimensions
    # that have padding.
    if resized_width < output_width:
        # adjust for padding pixels
        point['x'] *= (output_width / resized_width)
    if resized_height < output_height:
        # adjust for padding pixels
        point['y'] *= (output_height / resized_height)
|
55 |
+
|
56 |
|
57 |
def process_refexp(image: Image, prompt: str):
|
58 |
|
|
|
124 |
print(f"image width, height: {width, height}")
|
125 |
print(f"processed prompt: {prompt}")
|
126 |
|
127 |
+
# convert coordinates from tensor image size to input image size
# processor.image_processor.size is (height, width); out_size is (width, height)
out_size = (processor.image_processor.size[1], processor.image_processor.size[0])
translate_point_coords_from_out_to_in(
    point=center_point, input_image_size=image.size, output_image_size=out_size)
|
130 |
+
|
131 |
x = math.floor(width*center_point["x"])
|
132 |
y = math.floor(height*center_point["y"])
|
133 |
|