Improve model card: Add link to paper, code, correct pipeline tag, reorder metadata

#1
opened by nielsr (HF staff)
Files changed (1)
  1. README.md +18 -11
README.md CHANGED
@@ -1,15 +1,16 @@
 ---
-license: apache-2.0
+pipeline_tag: image-text-to-text
 library_name: transformers
+license: apache-2.0
 language:
 - en
 tags:
 - Sentence Similarity
 - Embedding
-- zero-shot-image-classification
-- video-text-to-text
-pipeline_tag: image-text-to-text
+- zero-shot-image-classification
+- video-text-to-text
 ---
+
 # LLaVE-2B
 
 ## Model Summary
@@ -17,7 +18,7 @@ pipeline_tag: image-text-to-text
 The LLaVE models are 2B parameter multimodal embedding models based on the Aquila-VL-2B model with a context window of 4K tokens.
 
 - **Repository:** [LLaVE](https://github.com/DeepLearnXMU/LLaVE)
-- **Paper:** [LLaVE](https://arxiv.org/abs/2503.04812)
+- **Paper:** [LLaVE](https://huggingface.co/papers/2503.04812)
 
 ## Train/Eval Data
 - Train data: https://huggingface.co/datasets/TIGER-Lab/MMEB-train
@@ -81,7 +82,8 @@ conv_template = "qwen_1_5" # Make sure you use correct chat template for differ
 question = DEFAULT_IMAGE_TOKEN + " Represent the given image with the following question: What is in the image"
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], question)
-conv.append_message(conv.roles[1], "\n")
+conv.append_message(conv.roles[1], "
+")
 prompt_question = conv.get_prompt()
 input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
 attention_mask=input_ids.ne(tokenizer.pad_token_id)
@@ -91,7 +93,8 @@ query_embed = model.encode_multimodal_embeddings(input_ids, attention_mask=atten
 target_string = "A cat and a dog"
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], target_string)
-conv.append_message(conv.roles[1], "\n")
+conv.append_message(conv.roles[1], "
+")
 target_string = conv.get_prompt()
 target_input_ids = tokenizer(target_string, return_tensors="pt").input_ids.to(device)
 attention_mask=target_input_ids.ne(tokenizer.pad_token_id)
@@ -103,7 +106,8 @@ print("A cat and a dog similarity score: ", query_embed @ target_embed.T)
 neg_string = "A cat and a tiger"
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], neg_string)
-conv.append_message(conv.roles[1], "\n")
+conv.append_message(conv.roles[1], "
+")
 neg_string = conv.get_prompt()
 neg_input_ids = tokenizer(neg_string, return_tensors="pt").input_ids.to(device)
 attention_mask=neg_input_ids.ne(tokenizer.pad_token_id)
@@ -116,7 +120,8 @@ print("A cat and a tiger similarity score: ", query_embed @ neg_embed.T)
 pos_string = "Find me an everyday image that matches the given caption: A cat and a dog."
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], pos_string)
-conv.append_message(conv.roles[1], "\n")
+conv.append_message(conv.roles[1], "
+")
 pos_string = conv.get_prompt()
 pos_input_ids = tokenizer(pos_string, return_tensors="pt").input_ids.to(device)
 attention_mask=pos_input_ids.ne(tokenizer.pad_token_id)
@@ -125,7 +130,8 @@ pos_query_embed = model.encode_multimodal_embeddings(pos_input_ids, attention_ma
 target = DEFAULT_IMAGE_TOKEN + " Represent the given image."
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], target)
-conv.append_message(conv.roles[1], "\n")
+conv.append_message(conv.roles[1], "
+")
 prompt_target = conv.get_prompt()
 target_input_ids = tokenizer_image_token(prompt_target, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
 attention_mask=target_input_ids.ne(tokenizer.pad_token_id)
@@ -138,7 +144,8 @@ print("A cat and a dog image similarity score: ", pos_query_embed @ target_embed
 neg_string = "Find me an everyday image that matches the given caption: A cat and a tiger."
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], neg_string)
-conv.append_message(conv.roles[1], "\n")
+conv.append_message(conv.roles[1], "
+")
 neg_string = conv.get_prompt()
 neg_input_ids = tokenizer(neg_string, return_tensors="pt").input_ids.to(device)
 attention_mask=neg_input_ids.ne(tokenizer.pad_token_id)
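
All six code hunks make the same mechanical change to the README's usage snippet: the escaped `"\n"` in the empty assistant turn becomes a literal line break split across two lines (note that only the escaped form is valid Python). For readers landing on this PR without the full model card, below is a minimal end-to-end sketch assembling the fragments visible in the diff. Only the conversation-template calls and `model.encode_multimodal_embeddings` come from the README itself; the imports, the `load_pretrained_model` call, the image preprocessing, and the `images=`/`image_sizes=` keyword arguments are assumptions based on the LLaVA-NeXT-style API the snippet implies.

```python
# Minimal sketch of the usage pattern shown in the diff. Everything outside the
# conversation-building and encode_multimodal_embeddings calls is an assumption
# (LLaVA-NeXT-style loading and preprocessing), not part of the diffed README.
import copy

import torch
from PIL import Image

from llava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
from llava.conversation import conv_templates
from llava.mm_utils import process_images, tokenizer_image_token
from llava.model.builder import load_pretrained_model

device = "cuda"
# Assumed loader from the LLaVE/LLaVA-NeXT codebase; check the repository for
# the exact signature and model_name.
tokenizer, model, image_processor, _ = load_pretrained_model(
    "zhibinlan/LLaVE-2B", None, "llava_qwen", device_map="auto"
)
model.eval()

# Build an image+text query, as in the first code hunk.
conv_template = "qwen_1_5"  # chat template used throughout the README snippet
image = Image.open("example.jpg")  # hypothetical local image
image_tensor = process_images([image], image_processor, model.config)
image_tensor = [t.to(dtype=torch.float16, device=device) for t in image_tensor]

question = DEFAULT_IMAGE_TOKEN + " Represent the given image with the following question: What is in the image"
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], "\n")  # empty assistant turn, escaped form
input_ids = tokenizer_image_token(
    conv.get_prompt(), tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
).unsqueeze(0).to(device)
query_embed = model.encode_multimodal_embeddings(
    input_ids,
    attention_mask=input_ids.ne(tokenizer.pad_token_id),
    images=image_tensor,        # assumed kwargs; not visible in this diff
    image_sizes=[image.size],
)

# Encode a text-only candidate the same way and score it against the query.
target_string = "A cat and a dog"
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], target_string)
conv.append_message(conv.roles[1], "\n")
target_input_ids = tokenizer(conv.get_prompt(), return_tensors="pt").input_ids.to(device)
target_embed = model.encode_multimodal_embeddings(
    target_input_ids, attention_mask=target_input_ids.ne(tokenizer.pad_token_id)
)

print("A cat and a dog similarity score: ", query_embed @ target_embed.T)
```

The negative-caption and text-to-image blocks in the later hunks follow the same pattern, differing only in the prompt string and in which side of the pair carries the image token.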