Improve model card: Add link to paper, code, correct pipeline tag, reorder metadata
#1 · opened by nielsr (HF staff)

README.md CHANGED
@@ -1,15 +1,16 @@
 ---
-
+pipeline_tag: image-text-to-text
 library_name: transformers
+license: apache-2.0
 language:
 - en
 tags:
 - Sentence Similarity
 - Embedding
-- zero-shot-image-classification
-- video-text-to-text
-pipeline_tag: image-text-to-text
+- zero-shot-image-classification
+- video-text-to-text
 ---
+
 # LLaVE-2B
 
 ## Model Summary
@@ -17,7 +18,7 @@ pipeline_tag: image-text-to-text
 The LLaVE models are 2B parameter multimodal embedding models based on the Aquila-VL-2B model with a context window of 4K tokens.
 
 - **Repository:** [LLaVE](https://github.com/DeepLearnXMU/LLaVE)
-- **Paper:** [LLaVE](https://arxiv.org/abs/2503.04812)
+- **Paper:** [LLaVE](https://huggingface.co/papers/2503.04812)
 
 ## Train/Eval Data
 - Train data: https://huggingface.co/datasets/TIGER-Lab/MMEB-train
@@ -81,7 +82,8 @@ conv_template = "qwen_1_5" # Make sure you use correct chat template for different models
 question = DEFAULT_IMAGE_TOKEN + " Represent the given image with the following question: What is in the image"
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], question)
-conv.append_message(conv.roles[1], "\n")
+conv.append_message(conv.roles[1], "
+")
 prompt_question = conv.get_prompt()
 input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
 attention_mask=input_ids.ne(tokenizer.pad_token_id)
@@ -91,7 +93,8 @@ query_embed = model.encode_multimodal_embeddings(input_ids, attention_mask=attention_mask, images=image_tensor, image_sizes=image_sizes)
 target_string = "A cat and a dog"
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], target_string)
-conv.append_message(conv.roles[1], "\n")
+conv.append_message(conv.roles[1], "
+")
 target_string = conv.get_prompt()
 target_input_ids = tokenizer(target_string, return_tensors="pt").input_ids.to(device)
 attention_mask=target_input_ids.ne(tokenizer.pad_token_id)
@@ -103,7 +106,8 @@ print("A cat and a dog similarity score: ", query_embed @ target_embed.T)
 neg_string = "A cat and a tiger"
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], neg_string)
-conv.append_message(conv.roles[1], "\n")
+conv.append_message(conv.roles[1], "
+")
 neg_string = conv.get_prompt()
 neg_input_ids = tokenizer(neg_string, return_tensors="pt").input_ids.to(device)
 attention_mask=neg_input_ids.ne(tokenizer.pad_token_id)
@@ -116,7 +120,8 @@ print("A cat and a tiger similarity score: ", query_embed @ neg_embed.T)
 pos_string = "Find me an everyday image that matches the given caption: A cat and a dog."
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], pos_string)
-conv.append_message(conv.roles[1], "\n")
+conv.append_message(conv.roles[1], "
+")
 pos_string = conv.get_prompt()
 pos_input_ids = tokenizer(pos_string, return_tensors="pt").input_ids.to(device)
 attention_mask=pos_input_ids.ne(tokenizer.pad_token_id)
@@ -125,7 +130,8 @@ pos_query_embed = model.encode_multimodal_embeddings(pos_input_ids, attention_mask=attention_mask)
 target = DEFAULT_IMAGE_TOKEN + " Represent the given image."
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], target)
-conv.append_message(conv.roles[1], "\n")
+conv.append_message(conv.roles[1], "
+")
 prompt_target = conv.get_prompt()
 target_input_ids = tokenizer_image_token(prompt_target, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
 attention_mask=target_input_ids.ne(tokenizer.pad_token_id)
@@ -138,7 +144,8 @@ print("A cat and a dog image similarity score: ", pos_query_embed @ target_embed.T)
 neg_string = "Find me an everyday image that matches the given caption: A cat and a tiger."
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], neg_string)
-conv.append_message(conv.roles[1], "\n")
+conv.append_message(conv.roles[1], "
+")
 neg_string = conv.get_prompt()
 neg_input_ids = tokenizer(neg_string, return_tensors="pt").input_ids.to(device)
 attention_mask=neg_input_ids.ne(tokenizer.pad_token_id)
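For reference, the lines these hunks touch all belong to the model card's embedding example. Below is a minimal reassembly of the text-embedding flow as a runnable sketch. It assumes `model`, `tokenizer`, and `device` come from the card's earlier setup code (outside these hunks), and the `embed_text` helper is hypothetical, introduced only to group the lines each hunk repeats.

```python
import copy

# Assumption: the card's snippet imports the LLaVA conversation templates.
from llava.conversation import conv_templates

conv_template = "qwen_1_5"  # chat template named in the hunk context above

def embed_text(text: str):
    # Hypothetical helper wrapping the repeated pattern from the hunks:
    # put `text` in the user turn, open the assistant turn, and encode.
    conv = copy.deepcopy(conv_templates[conv_template])
    conv.append_message(conv.roles[0], text)
    conv.append_message(conv.roles[1], "\n")
    prompt = conv.get_prompt()
    # `tokenizer`, `device`, and `model` come from the card's setup (not shown here).
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    attention_mask = input_ids.ne(tokenizer.pad_token_id)
    return model.encode_multimodal_embeddings(input_ids, attention_mask=attention_mask)

target_embed = embed_text("A cat and a dog")
neg_embed = embed_text("A cat and a tiger")
print("cat-and-dog vs. cat-and-tiger similarity: ", target_embed @ neg_embed.T)
```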