Update README.md
README.md
```

**Image Inference**

<details>
<summary>Inference without SoMs</summary>

Our model can perform inference on images without [Set-of-Marks](https://arxiv.org/abs/2310.11441) visual prompts; in this case, it can be used in the same way as its base model, [LLaVA-NeXT](https://github.com/LLaVA-VL/LLaVA-NeXT).

```python
import torch
import requests
# ... (the rest of this example is unchanged and collapsed in this diff)
```
</details>

<details>
<summary>Inference with SoMs</summary>

Our model performs more fine-grained understanding when [Set-of-Marks](https://arxiv.org/abs/2310.11441) visual prompts are provided.
You can refer to the instances you are interested in by their IDs.
Compared to the previous inference code, the following code is unchanged except that the input image is visually prompted with Set-of-Marks.
Refer to [this link](https://github.com/microsoft/SoM) to learn how to generate SoMs for an image (a rough do-it-yourself approximation is sketched right after this section).

```python
import torch
# ... (the rest of this example is unchanged and collapsed in this diff)
```
</details>

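The SoM repository linked above provides the full mark-generation pipeline. If you only want a quick visual prompt to try the model with, a rough approximation is to draw numbered ID labels on the image yourself. The sketch below is a hypothetical PIL-only illustration, not the official SoM tool (which derives its marks from segmentation masks); the file names and coordinates are placeholders.

```python
# Hypothetical illustration only: overlay numeric ID marks on an image with PIL.
# The official SoM pipeline derives its marks from segmentation masks; here the
# instance centers are supplied by hand and the paths are placeholders.
from PIL import Image, ImageDraw

def draw_id_marks(image, centers, box=14):
    """Return a copy of `image` with a numbered label at each (x, y) center."""
    marked = image.copy()
    draw = ImageDraw.Draw(marked)
    for idx, (x, y) in enumerate(centers, start=1):
        draw.rectangle([x - box, y - box, x + box, y + box], fill="red")
        draw.text((x - box // 2, y - box // 2), str(idx), fill="white")
    return marked

# marked = draw_id_marks(Image.open("image.jpg"), [(120, 80), (300, 220)])
# marked.save("image_som.jpg")  # feed this marked image to the inference code above
```

Whatever IDs end up drawn on the image are the ones you can then reference in the prompt, e.g. `What is [2] doing?`.
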
**Video Inference**

For videos, we organize the sampled frames into a list (one way to build such a list from a local video file is sketched below). You can use the format \<t\> to refer to a specific timestamp (e.g. \<1\>).

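The examples in this section fetch eight pre-extracted demo frames by URL. If you are starting from a local video file instead, you first need to turn it into such a frame list; the following is a minimal sketch using OpenCV, where the file name, frame count, and uniform sampling strategy are assumptions rather than part of the original examples.

```python
# Minimal sketch (assumption): uniformly sample N frames from a local video with
# OpenCV and convert them to PIL images, matching the `video` list used below.
import cv2
from PIL import Image

def sample_frames(video_path, num_frames=8):
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    indices = [int(i * (total - 1) / (num_frames - 1)) for i in range(num_frames)]
    frames = []
    for idx in indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ok, frame = cap.read()
        if ok:
            # OpenCV returns BGR; convert to RGB before wrapping in PIL
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    cap.release()
    return frames

# video = sample_frames("my_video.mp4")  # placeholder path; preprocess as shown below
```
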
<details>
<summary>Inference without SoMs</summary>

Our model can perform inference on videos without [Set-of-Marks](https://arxiv.org/abs/2310.11441) visual prompts; in this case, it can be used in the same way as its base model, [LLaVA-NeXT](https://github.com/LLaVA-VL/LLaVA-NeXT).

```python
import torch
import requests
from PIL import Image

# tokenizer, model, and image_processor come from the load_pretrained_model(...) call above
frame_urls = [
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/frame_1.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/frame_2.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/frame_3.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/frame_4.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/frame_5.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/frame_6.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/frame_7.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/frame_8.jpg?raw=true"
]
video = [Image.open(requests.get(frame_url, stream=True).raw) for frame_url in frame_urls]
video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda()
video = video.bfloat16()
videos = [video]

# Pick one of the following prompts (the second assignment overrides the first):
question = "Describe the video."  # overall video caption
question = "What happens at frame <1>?"  # caption a specific moment
question = DEFAULT_IMAGE_TOKEN + "\n" + question

conv_template = 'vicuna_v1'
conv = conv_templates[conv_template].copy()
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()

pad_token_ids = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
attention_masks = input_ids.ne(pad_token_ids).long().cuda()

stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

with torch.inference_mode():
    output_ids = model.generate(
        inputs=input_ids,
        images=videos,
        attention_mask=attention_masks,
        modalities="video",
        use_cache=True,
        stopping_criteria=[stopping_criteria],
        max_new_tokens=4096
    )

pred = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(pred)
```
</details>

<details>
<summary>Inference with SoMs</summary>

Our model performs more fine-grained understanding when [Set-of-Marks](https://arxiv.org/abs/2310.11441) visual prompts are provided.
You can refer to the instances you are interested in by their IDs.
Compared to the previous inference code, the following code is unchanged except that the input video is visually prompted with Set-of-Marks.
Refer to [SAM2](https://github.com/facebookresearch/sam2) and [SoM](https://github.com/microsoft/SoM) to learn how to generate SoMs for a video (a toy illustration of consistently numbered frames follows this section).

```python
import torch
import requests
from PIL import Image

frame_urls = [
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/som_frame_1.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/som_frame_2.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/som_frame_3.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/som_frame_4.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/som_frame_5.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/som_frame_6.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/som_frame_7.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/som_frame_8.jpg?raw=true"
]
video = [Image.open(requests.get(frame_url, stream=True).raw) for frame_url in frame_urls]
video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda()
video = video.bfloat16()
videos = [video]

# You can use [id] to refer to the instances that you are interested in
question = "Is [3] visible at <1>?"
question = DEFAULT_IMAGE_TOKEN + "\n" + question

conv_template = 'vicuna_v1'
conv = conv_templates[conv_template].copy()
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()

pad_token_ids = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
attention_masks = input_ids.ne(pad_token_ids).long().cuda()

stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

with torch.inference_mode():
    output_ids = model.generate(
        inputs=input_ids,
        images=videos,
        attention_mask=attention_masks,
        modalities="video",
        use_cache=True,
        stopping_criteria=[stopping_criteria],
        max_new_tokens=4096
    )

pred = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(pred)
```
</details>

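For video, the important property of the Set-of-Marks prompt is that each instance keeps the same numeric ID in every frame, which is what a tracker such as SAM2 provides. The sketch below only illustrates that end result: it stamps hypothetical, pre-computed per-frame centers with consistent IDs using PIL; it is not the SAM2/SoM pipeline itself.

```python
# Hypothetical illustration only: given per-frame instance centers (e.g. produced
# by any tracker), stamp every frame with consistent numeric IDs so that [1], [2], ...
# denote the same instance throughout the video. Not the official SAM2/SoM pipeline.
from PIL import Image, ImageDraw

def mark_video_frames(frames, tracks, box=14):
    """frames: list of PIL images; tracks: {instance_id: [(x, y) for each frame]}."""
    marked_frames = []
    for t, frame in enumerate(frames):
        marked = frame.copy()
        draw = ImageDraw.Draw(marked)
        for inst_id, centers in tracks.items():
            x, y = centers[t]
            draw.rectangle([x - box, y - box, x + box, y + box], fill="red")
            draw.text((x - box // 2, y - box // 2), str(inst_id), fill="white")
        marked_frames.append(marked)
    return marked_frames

# Placeholder usage: two instances tracked over the eight sampled frames
# video = mark_video_frames(video, {1: [(100, 80)] * 8, 2: [(300, 220)] * 8})
```
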
## Contact
Feel free to contact us if you have any questions or suggestions