---
license: apache-2.0
---

# LLaVE-2B

## Model Summary

The LLaVE models are 2B-parameter multimodal embedding models based on the Aquila-VL-2B model, with a context window of 4K tokens.

- **Repository:** [LLaVE](https://github.com/DeepLearnXMU/LLaVE)
- **Paper:** [LLaVE](https://arxiv.org/abs/2503.04812)

## Train/Eval Data

- Train data: https://huggingface.co/datasets/TIGER-Lab/MMEB-train
- Eval data: https://huggingface.co/datasets/TIGER-Lab/MMEB-eval
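
Both datasets are hosted on the Hugging Face Hub, so they can be inspected with the `datasets` library. Below is a minimal sketch, assuming the per-task subsets are exposed as dataset configurations; we discover the configuration names rather than hard-coding one:

```python
# Peek at the MMEB training data. MMEB is organized into per-task subsets,
# so list the available configuration names instead of hard-coding one.
from datasets import get_dataset_config_names, load_dataset

configs = get_dataset_config_names("TIGER-Lab/MMEB-train")
print(configs[:5])  # first few task subsets

ds = load_dataset("TIGER-Lab/MMEB-train", configs[0])
print(ds)  # inspect the splits and fields of this subset
```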

## Use

### Intended use

The model can embed images, multi-image inputs, and videos together with text; the Quick Start below walks through single-image embedding, and a multi-image sketch follows the example.

### Quick Start

First, clone our GitHub repository and install the package:

```bash
git clone https://github.com/DeepLearnXMU/LLaVE
cd LLaVE
pip install -e ".[train]"
```

We provide a simple embedding example below. For more details, please refer to [GitHub](https://github.com/DeepLearnXMU/LLaVE).

```python
# pip install git+https://github.com/DeepLearnXMU/LLaVE

import torch
import copy
from PIL import Image
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates
from llava.model.builder import load_pretrained_model
from llava.mm_utils import tokenizer_image_token, process_images

pretrained = "zhibinlan/LLaVE-2B"
model_name = "llava_qwen"
device = "cuda"
device_map = "auto"
tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map)  # Pass any other llava_model_args you need here
model.eval()

# Image + Text -> Text
image = Image.open("figures/example.jpg")
image_tensor = process_images([image], image_processor, model.config)
image_tensor = [_image.to(dtype=torch.float16, device=device) for _image in image_tensor]
conv_template = "qwen_1_5"  # Make sure you use the correct chat template for different models

question = DEFAULT_IMAGE_TOKEN + " Represent the given image with the following question: What is in the image"
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], "\n")
prompt_question = conv.get_prompt()
input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
attention_mask = input_ids.ne(tokenizer.pad_token_id)
image_sizes = [image.size]
query_embed = model.encode_multimodal_embeddings(input_ids, attention_mask=attention_mask, images=image_tensor, image_sizes=image_sizes)

target_string = "A cat and a dog"
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], target_string)
conv.append_message(conv.roles[1], "\n")
target_string = conv.get_prompt()
target_input_ids = tokenizer(target_string, return_tensors="pt").input_ids.to(device)
attention_mask = target_input_ids.ne(tokenizer.pad_token_id)
target_embed = model.encode_multimodal_embeddings(target_input_ids, attention_mask=attention_mask)

print("A cat and a dog similarity score: ", query_embed @ target_embed.T)
# 2B: A cat and a dog similarity score: tensor([[0.5132]])

neg_string = "A cat and a tiger"
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], neg_string)
conv.append_message(conv.roles[1], "\n")
neg_string = conv.get_prompt()
neg_input_ids = tokenizer(neg_string, return_tensors="pt").input_ids.to(device)
attention_mask = neg_input_ids.ne(tokenizer.pad_token_id)
neg_embed = model.encode_multimodal_embeddings(neg_input_ids, attention_mask=attention_mask)
print("A cat and a tiger similarity score: ", query_embed @ neg_embed.T)
# 2B: A cat and a tiger similarity score: tensor([[0.3809]])

# Text -> Image
pos_string = "Find me an everyday image that matches the given caption: A cat and a dog."
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], pos_string)
conv.append_message(conv.roles[1], "\n")
pos_string = conv.get_prompt()
pos_input_ids = tokenizer(pos_string, return_tensors="pt").input_ids.to(device)
attention_mask = pos_input_ids.ne(tokenizer.pad_token_id)
pos_query_embed = model.encode_multimodal_embeddings(pos_input_ids, attention_mask=attention_mask)

target = DEFAULT_IMAGE_TOKEN + " Represent the given image."
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], target)
conv.append_message(conv.roles[1], "\n")
prompt_target = conv.get_prompt()
target_input_ids = tokenizer_image_token(prompt_target, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
attention_mask = target_input_ids.ne(tokenizer.pad_token_id)
target_image_sizes = [image.size]
target_embed = model.encode_multimodal_embeddings(target_input_ids, attention_mask=attention_mask, images=image_tensor, image_sizes=target_image_sizes)

print("A cat and a dog image similarity score: ", pos_query_embed @ target_embed.T)
# 2B: A cat and a dog image similarity score: tensor([[0.5225]])

neg_string = "Find me an everyday image that matches the given caption: A cat and a tiger."
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], neg_string)
conv.append_message(conv.roles[1], "\n")
neg_string = conv.get_prompt()
neg_input_ids = tokenizer(neg_string, return_tensors="pt").input_ids.to(device)
attention_mask = neg_input_ids.ne(tokenizer.pad_token_id)
neg_query_embed = model.encode_multimodal_embeddings(neg_input_ids, attention_mask=attention_mask)

print("A cat and a tiger image similarity score: ", neg_query_embed @ target_embed.T)
# 2B: A cat and a tiger image similarity score: tensor([[0.4141]])
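
# Illustrative extension (not part of the original example): the scores above
# are plain inner products of the returned embeddings, so a gallery of
# candidates can be ranked with a single matrix multiplication. Here we rank
# the two captions against the image embedding; normalization is applied
# defensively in case the embeddings are not already unit-length.
caption_embeds = torch.nn.functional.normalize(torch.cat([pos_query_embed, neg_query_embed], dim=0), dim=-1)
image_embed = torch.nn.functional.normalize(target_embed, dim=-1)
ranking_scores = (caption_embeds @ image_embed.T).squeeze(-1)  # shape: (num_captions,)
print("Best-matching caption index:", ranking_scores.argmax().item())  # expected: 0 ("A cat and a dog")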
```
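
The intended-use section above also mentions multi-image and video inputs. The following is a minimal multi-image sketch under the same interface as the Quick Start, assuming `process_images` accepts several PIL images and that one image token is supplied per input image; the file names are placeholders. For working multi-image and video scripts, please refer to the [GitHub repository](https://github.com/DeepLearnXMU/LLaVE).

```python
# Multi-image embedding: a hedged sketch reusing the objects loaded above.
# "photo1.jpg" / "photo2.jpg" are placeholder file names.
images = [Image.open("photo1.jpg"), Image.open("photo2.jpg")]
image_tensors = process_images(images, image_processor, model.config)
image_tensors = [t.to(dtype=torch.float16, device=device) for t in image_tensors]

# One <image> token per input image, followed by the embedding instruction.
question = "\n".join([DEFAULT_IMAGE_TOKEN] * len(images)) + " Represent the given images."
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], "\n")
input_ids = tokenizer_image_token(conv.get_prompt(), tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
attention_mask = input_ids.ne(tokenizer.pad_token_id)
multi_image_embed = model.encode_multimodal_embeddings(
    input_ids,
    attention_mask=attention_mask,
    images=image_tensors,
    image_sizes=[img.size for img in images],
)
```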

## Hardware & Software

- **GPUs:** 8 × NVIDIA A100 (40GB), used for whole-model training
- **Orchestration:** [Hugging Face Trainer](https://huggingface.co/docs/transformers/main_classes/trainer)
- **Neural networks:** [PyTorch](https://github.com/pytorch/pytorch)

## Citation

```bibtex
@article{lan2025llave,
  title={LLaVE: Large Language and Vision Embedding Models with Hardness-Weighted Contrastive Learning},
  author={Lan, Zhibin and Niu, Liqiang and Meng, Fandong and Zhou, Jie and Su, Jinsong},
  journal={arXiv preprint arXiv:2503.04812},
  year={2025}
}
```