Update README.md
## Quick Start
**Install**

Our code is based on LLaVA-NeXT. Before running, please install LLaVA-NeXT to prepare the environment:
```shell
pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
```

Then import the required utilities and load the pretrained model:

```python
from llava.model.builder import load_pretrained_model
from llava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
from llava.mm_utils import (
    KeywordsStoppingCriteria,
    get_model_name_from_path,
    tokenizer_image_token,
    process_images
)
from llava.conversation import SeparatorStyle, conv_templates

overwrite_config = {}
overwrite_config["mm_spatial_pool_stride"] = 2
overwrite_config["mm_spatial_pool_mode"] = 'bilinear'

tokenizer, model, image_processor, max_length = load_pretrained_model(
    ...  # checkpoint path and remaining arguments elided
)
```
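
The arguments elided above (`...`) include the checkpoint path. As a minimal sketch, assuming the LLaVA-NeXT builder signature and a placeholder checkpoint id (replace it with the Inst-IT checkpoint you actually use), the call might look like:

```python
pretrained = "Inst-IT/LLaVA-Next-Inst-It-Vicuna-7B"  # placeholder checkpoint id
model_name = get_model_name_from_path(pretrained)

tokenizer, model, image_processor, max_length = load_pretrained_model(
    pretrained,
    None,                               # no separate base model
    model_name,
    torch_dtype="bfloat16",
    device_map="auto",
    overwrite_config=overwrite_config,  # apply the spatial-pooling settings above
)
model.eval()
```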

**Image Inference**

<details>
<summary>Inference without SoMs</summary>

Our model can perform inference on images without [Set-of-Marks](https://arxiv.org/abs/2310.11441) visual prompts; in this case, it can be used in the same way as its base model, [LLaVA-NeXT](https://github.com/LLaVA-VL/LLaVA-NeXT).

```python
import torch
import requests
from PIL import Image

# load and preprocess the input image
img_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
image = Image.open(requests.get(img_url, stream=True).raw)
image_tensor = process_images([image], image_processor, model.config).bfloat16()
image_sizes = [image.size]

# build the conversation prompt, prepending the image token
question = "Describe this image."
question = DEFAULT_IMAGE_TOKEN + "\n" + question

conv_template = 'vicuna_v1'
conv = conv_templates[conv_template].copy()
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()

pad_token_ids = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
attention_masks = input_ids.ne(pad_token_ids).long().cuda()

# stop generation at the conversation separator
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

with torch.inference_mode():
    output_ids = model.generate(
        inputs=input_ids,
        images=image_tensor,
        attention_mask=attention_masks,
        modalities="image",
        image_sizes=image_sizes,
        use_cache=True,
        stopping_criteria=[stopping_criteria],
        max_new_tokens=4096
    )

pred = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(pred)
```
</details>

<details>
<summary>Inference with SoMs</summary>

Our model performs even better when [Set-of-Marks](https://arxiv.org/abs/2310.11441) visual prompts are provided.
The code below is unchanged from the previous example except for the input image, which is visually prompted with Set-of-Marks.
You can refer to [this link](https://github.com/microsoft/SoM) to learn how to generate SoMs for an image; a sketch of an instance-specific question follows this example.

```python
import torch
import requests
from PIL import Image

# load and preprocess the input image; it should already have Set-of-Marks overlaid
img_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
image = Image.open(requests.get(img_url, stream=True).raw)
image_tensor = process_images([image], image_processor, model.config).bfloat16()
image_sizes = [image.size]

# build the conversation prompt, prepending the image token
question = "Describe this image."
question = DEFAULT_IMAGE_TOKEN + "\n" + question

conv_template = 'vicuna_v1'
conv = conv_templates[conv_template].copy()
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()

pad_token_ids = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
attention_masks = input_ids.ne(pad_token_ids).long().cuda()

# stop generation at the conversation separator
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

with torch.inference_mode():
    output_ids = model.generate(
        inputs=input_ids,
        images=image_tensor,
        attention_mask=attention_masks,
        modalities="image",
        image_sizes=image_sizes,
        use_cache=True,
        stopping_criteria=[stopping_criteria],
        max_new_tokens=4096
    )

pred = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(pred)
```
</details>
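
With a SoM-annotated image, you can also ask about individual marked instances. The bracketed IDs in the sketch below are an assumed way of referring to the numeric marks, not a format prescribed by this README; only the question string changes relative to the code above:

```python
# Hypothetical instance-level question: "[1]" and "[2]" assume the image carries
# numeric Set-of-Marks labels; adjust the IDs to match your annotated image.
question = "What is the relationship between [1] and [2] in this image?"
question = DEFAULT_IMAGE_TOKEN + "\n" + question
# rebuild the prompt and call model.generate() exactly as in the example above
```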

**Video Inference**

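The video example itself is not shown in this excerpt. As a rough sketch only, assuming a frame-sampling helper, a placeholder `demo.mp4` path, and the `image_processor.preprocess` / `modalities=["video"]` usage from the base LLaVA-NeXT video recipe (none of which are confirmed by this README), video inference might look like:

```python
# Sketch following the base LLaVA-NeXT video pattern; Inst-IT's own example may differ.
import numpy as np
from decord import VideoReader, cpu  # assumed dependency for reading video frames

def load_video_frames(video_path, num_frames=16):
    """Uniformly sample num_frames frames as a (T, H, W, C) uint8 array."""
    vr = VideoReader(video_path, ctx=cpu(0))
    frame_idx = np.linspace(0, len(vr) - 1, num_frames, dtype=int).tolist()
    return vr.get_batch(frame_idx).asnumpy()

video_frames = load_video_frames("demo.mp4")  # placeholder path
video_tensor = image_processor.preprocess(video_frames, return_tensors="pt")["pixel_values"].cuda().bfloat16()

question = DEFAULT_IMAGE_TOKEN + "\n" + "Describe this video."
conv = conv_templates['vicuna_v1'].copy()
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], None)
input_ids = tokenizer_image_token(conv.get_prompt(), tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()

with torch.inference_mode():
    output_ids = model.generate(
        inputs=input_ids,
        images=[video_tensor],    # one stacked-frame tensor per video
        modalities=["video"],     # list form, as in base LLaVA-NeXT video examples
        use_cache=True,
        max_new_tokens=4096,
    )
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip())
```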