alvarobartt (HF staff) committed
Commit 53c3843 · verified · 1 Parent(s): b5731cf

Upload 2 files

Files changed (2)
  1. handler.py +189 -0
  2. requirements.txt +2 -0
handler.py ADDED
@@ -0,0 +1,189 @@
+ # Adapted from https://huggingface.co/nvidia/NVLM-D-72B#inference
+
+ import math
+ from typing import Any, Dict, List
+
+ import torch
+ import torchvision.transforms as T
+ from torchvision.transforms.functional import InterpolationMode
+
+ import requests
+ from io import BytesIO
+ from PIL import Image
+
+ from transformers import AutoTokenizer, AutoModel
+
+ from huggingface_inference_toolkit.logging import logger
+
+
+ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+     best_ratio_diff = float("inf")
+     best_ratio = (1, 1)
+     area = width * height
+     for ratio in target_ratios:
+         target_aspect_ratio = ratio[0] / ratio[1]
+         ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+         if ratio_diff < best_ratio_diff:
+             best_ratio_diff = ratio_diff
+             best_ratio = ratio
+         elif ratio_diff == best_ratio_diff:
+             if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                 best_ratio = ratio
+     return best_ratio
+
+
+ def dynamic_preprocess(
+     image, min_num=1, max_num=12, image_size=448, use_thumbnail=False
+ ):
+     orig_width, orig_height = image.size
+     aspect_ratio = orig_width / orig_height
+
+     # calculate the existing image aspect ratio
+     target_ratios = set(
+         (i, j)
+         for n in range(min_num, max_num + 1)
+         for i in range(1, n + 1)
+         for j in range(1, n + 1)
+         if i * j <= max_num and i * j >= min_num
+     )
+     target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+     # find the closest aspect ratio to the target
+     target_aspect_ratio = find_closest_aspect_ratio(
+         aspect_ratio,
+         target_ratios,
+         orig_width,
+         orig_height,
+         image_size,
+     )
+
+     # calculate the target width and height
+     target_width = image_size * target_aspect_ratio[0]
+     target_height = image_size * target_aspect_ratio[1]
+     blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+     # resize the image
+     resized_img = image.resize((target_width, target_height))
+     processed_images = []
+     for i in range(blocks):
+         box = (
+             (i % (target_width // image_size)) * image_size,
+             (i // (target_width // image_size)) * image_size,
+             ((i % (target_width // image_size)) + 1) * image_size,
+             ((i // (target_width // image_size)) + 1) * image_size,
+         )
+         # split the image
+         split_img = resized_img.crop(box)
+         processed_images.append(split_img)
+     assert len(processed_images) == blocks
+     if use_thumbnail and len(processed_images) != 1:
+         thumbnail_img = image.resize((image_size, image_size))
+         processed_images.append(thumbnail_img)
+     return processed_images
+
+
+ def load_image(image_url, input_size=448, max_num=12):
+     response = requests.get(image_url)
+     image = Image.open(BytesIO(response.content)).convert("RGB")
+     transform = build_transform(input_size=input_size)
+     images = dynamic_preprocess(
+         image, image_size=input_size, use_thumbnail=True, max_num=max_num
+     )
+     pixel_values = [transform(image) for image in images]
+     pixel_values = torch.stack(pixel_values)
+     return pixel_values
+
+
+ def split_model():
+     device_map = {}
+     world_size = torch.cuda.device_count()
+     num_layers = 80
+     # Since the first GPU will be used for ViT, treat it as half a GPU.
+     num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
+     num_layers_per_gpu = [num_layers_per_gpu] * world_size
+     num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
+     layer_cnt = 0
+     for i, num_layer in enumerate(num_layers_per_gpu):
+         for j in range(num_layer):
+             device_map[f"language_model.model.layers.{layer_cnt}"] = i
+             layer_cnt += 1
+     device_map["vision_model"] = 0
+     device_map["mlp1"] = 0
+     device_map["language_model.model.tok_embeddings"] = 0
+     device_map["language_model.model.embed_tokens"] = 0
+     device_map["language_model.output"] = 0
+     device_map["language_model.model.norm"] = 0
+     device_map["language_model.lm_head"] = 0
+     device_map[f"language_model.model.layers.{num_layers - 1}"] = 0
+
+     return device_map
+
+
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
+ IMAGENET_STD = (0.229, 0.224, 0.225)
+
+
+ def build_transform(input_size):
+     MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
+     transform = T.Compose(
+         [
+             T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
+             T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+             T.ToTensor(),
+             T.Normalize(mean=MEAN, std=STD),
+         ]
+     )
+     return transform
+
+
+ class EndpointHandler:
+     def __init__(self, model_dir: str, **kwargs: Any) -> None:
+         self.model = AutoModel.from_pretrained(
+             model_dir,
+             torch_dtype=torch.bfloat16,
+             low_cpu_mem_usage=True,
+             use_flash_attn=False,
+             trust_remote_code=True,
+             device_map=split_model(),
+         ).eval()
+
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             model_dir, trust_remote_code=True, use_fast=False
+         )
+
+     def __call__(self, data: Dict[str, Any]) -> Dict[str, List[Any]]:
+         if "instances" not in data:
+             raise ValueError(
+                 "The request body must contain a key 'instances' with a list of instances."
+             )
+
+         logger.debug(f"Received incoming request with {data=}")
+
+         predictions = []
+         for input in data["instances"]:
+             generation_config = dict(max_new_tokens=1024, do_sample=False)
+
+             if "image_url" not in input:
+                 # pure-text conversation
+                 response, history = self.model.chat(
+                     self.tokenizer,
+                     None,
+                     input["prompt"],
+                     generation_config,
+                     history=None,
+                     return_history=True,
+                 )
+             else:
+                 # single-image single-round conversation
+                 pixel_values = load_image(input["image_url"], max_num=6).to(
+                     torch.bfloat16
+                 )
+                 response = self.model.chat(
+                     self.tokenizer,
+                     pixel_values,
+                     f"<image>\n{input['prompt']}",
+                     generation_config,
+                 )
+
+             predictions.append(response)
+         return {"predictions": predictions}
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ einops
+ timm
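
For reference, a minimal sketch of how the handler above could be exercised, assuming the model weights are already available under the model directory (typically /repository on Inference Endpoints) and enough GPUs are present for the split_model() device map; the model directory, prompts, and image URL below are placeholders, not values from this commit:

from handler import EndpointHandler

# Hypothetical local smoke test; model_dir and image_url are placeholders.
handler = EndpointHandler(model_dir="/repository")

payload = {
    "instances": [
        # text-only instance: only "prompt" is provided
        {"prompt": "Write a haiku about GPUs."},
        # single-image instance: "image_url" triggers the vision path
        {"prompt": "What is shown in this image?", "image_url": "https://example.com/image.jpg"},
    ]
}

# __call__ returns {"predictions": [...]} with one response per instance
print(handler(payload))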