wjpoom committed
Commit 6036340 · verified · 1 Parent(s): c469139

Update README.md

Files changed (1):
  1. README.md +104 -1
README.md CHANGED
@@ -167,6 +167,7 @@ introduced in the paper [Inst-IT: Boosting Multimodal Instance Understanding via
 
 ## Quick Start
 **Install**
+
 Our code is based on LLaVA-NeXT. Before running, please install LLaVA-NeXT to prepare the environment:
 ```shell
 pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
@@ -185,10 +186,10 @@ from llava.mm_utils import (
     KeywordsStoppingCriteria,
     get_model_name_from_path,
     tokenizer_image_token,
+    process_images
 )
 from llava.conversation import SeparatorStyle, conv_templates
 
-
 overwrite_config = {}
 overwrite_config["mm_spatial_pool_stride"] = 2
 overwrite_config["mm_spatial_pool_mode"] = 'bilinear'
@@ -209,6 +210,108 @@ tokenizer, model, image_processor, max_length = load_pretrained_model(
 ```
 **Image Inference**
 
+<details>
+<summary>Inference without SoMs</summary>
+
+Our model can perform inference on images without [Set-of-Marks](https://arxiv.org/abs/2310.11441) visual prompts; in this case, it can be used in the same way as its base model, [LLaVA-NeXT](https://github.com/LLaVA-VL/LLaVA-NeXT).
+
+```python
+import torch
+import requests
+from PIL import Image
+
+img_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+image = Image.open(requests.get(img_url, stream=True).raw)
+image_tensor = process_images([image], image_processor, model.config).bfloat16()
+image_sizes = [image.size]
+
+question = "Describe this image."
+question = DEFAULT_IMAGE_TOKEN + "\n" + question
+
+conv_template = 'vicuna_v1'
+conv = conv_templates[conv_template].copy()
+conv.append_message(conv.roles[0], question)
+conv.append_message(conv.roles[1], None)
+prompt = conv.get_prompt()
+
+input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()
+
+pad_token_ids = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
+attention_masks = input_ids.ne(pad_token_ids).long().cuda()
+
+stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
+keywords = [stop_str]
+stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
+
+with torch.inference_mode():
+    output_ids = model.generate(
+        inputs=input_ids,
+        images=image_tensor,
+        attention_mask=attention_masks,
+        modalities="image",
+        image_sizes=image_sizes,
+        use_cache=True,
+        stopping_criteria=[stopping_criteria],
+        max_new_tokens=4096
+    )
+
+pred = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
+print(pred)
+```
+</details>
+
+
+<details>
+<summary>Inference with SoMs</summary>
+
+Our model performs even better when [Set-of-Marks](https://arxiv.org/abs/2310.11441) visual prompts are provided.
+Compared to the previous inference code, the following code is unchanged except for the input image, which is visually prompted with Set-of-Marks.
+You can refer to [this link](https://github.com/microsoft/SoM) to learn how to generate SoMs for an image.
+
+```python
+import torch
+import requests
+from PIL import Image
+
+img_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+image = Image.open(requests.get(img_url, stream=True).raw)
+image_tensor = process_images([image], image_processor, model.config).bfloat16()
+image_sizes = [image.size]
+
+question = "Describe this image."
+question = DEFAULT_IMAGE_TOKEN + "\n" + question
+
+conv_template = 'vicuna_v1'
+conv = conv_templates[conv_template].copy()
+conv.append_message(conv.roles[0], question)
+conv.append_message(conv.roles[1], None)
+prompt = conv.get_prompt()
+
+input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()
+
+pad_token_ids = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
+attention_masks = input_ids.ne(pad_token_ids).long().cuda()
+
+stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
+keywords = [stop_str]
+stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
+
+with torch.inference_mode():
+    output_ids = model.generate(
+        inputs=input_ids,
+        images=image_tensor,
+        attention_mask=attention_masks,
+        modalities="image",
+        image_sizes=image_sizes,
+        use_cache=True,
+        stopping_criteria=[stopping_criteria],
+        max_new_tokens=4096
+    )
+
+pred = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
+print(pred)
+```
+</details>
 
 **Video Inference**
 
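The added "Inference with SoMs" section points readers to https://github.com/microsoft/SoM to generate Set-of-Marks prompts. As a rough, illustrative stand-in (not part of this commit, and not the official SoM pipeline, which derives marks from automatic segmentation with models such as SAM or SEEM), the sketch below simply overlays numbered marks at hand-picked positions with Pillow; the mark coordinates and output filename are hypothetical placeholders.

```python
# Illustrative sketch only: overlay numbered marks on an image with Pillow.
# The official SoM toolkit (https://github.com/microsoft/SoM) derives marks
# from segmentation masks; here the instance centers are hard-coded.
import requests
from PIL import Image, ImageDraw, ImageFont

img_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")

# Hypothetical instance centers (x, y) in pixels; replace with centers taken
# from real masks or boxes produced by a segmentation/detection model.
centers = [(200, 300), (620, 340)]

draw = ImageDraw.Draw(image)
font = ImageFont.load_default()
for idx, (x, y) in enumerate(centers, start=1):
    r = 18  # radius of the circular mark
    draw.ellipse([x - r, y - r, x + r, y + r], fill="red", outline="white", width=2)
    draw.text((x - 4, y - 6), str(idx), fill="white", font=font)

# The marked image can then replace `image` in the inference snippets above.
image.save("demo_with_som_marks.jpeg")
```

For faithful results, the marks should come from the official SoM pipeline rather than hand-placed overlays; this sketch only shows the kind of numbered visual prompt the model expects.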