Improve model card: Add link to paper, code, correct pipeline tag, reorder metadata
#1 · opened by nielsr (HF staff)

README.md CHANGED
@@ -1,15 +1,16 @@
 ---
-
+pipeline_tag: image-text-to-text
 library_name: transformers
+license: apache-2.0
 language:
 - en
 tags:
 - Sentence Similarity
 - Embedding
-- zero-shot-image-classification
-- video-text-to-text
-pipeline_tag: image-text-to-text
+- zero-shot-image-classification
+- video-text-to-text
 ---
+
 # LLaVE-2B
 
 ## Model Summary
@@ -17,7 +18,7 @@ pipeline_tag: image-text-to-text
 The LLaVE models are 2B parameter multimodal embedding models based on the Aquila-VL-2B model with a context window of 4K tokens.
 
 - **Repository:** [LLaVE](https://github.com/DeepLearnXMU/LLaVE)
-- **Paper:** [LLaVE](https://arxiv.org/abs/2503.04812)
+- **Paper:** [LLaVE](https://huggingface.co/papers/2503.04812)
 
 ## Train/Eval Data
 - Train data: https://huggingface.co/datasets/TIGER-Lab/MMEB-train
@@ -81,7 +82,8 @@ conv_template = "qwen_1_5" # Make sure you use correct chat template for different models
 question = DEFAULT_IMAGE_TOKEN + " Represent the given image with the following question: What is in the image"
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], question)
-conv.append_message(conv.roles[1], "\n")
+conv.append_message(conv.roles[1], "
+")
 prompt_question = conv.get_prompt()
 input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
 attention_mask=input_ids.ne(tokenizer.pad_token_id)
@@ -91,7 +93,8 @@ query_embed = model.encode_multimodal_embeddings(input_ids, attention_mask=attention_mask, images=image_tensor, image_sizes=image_sizes)
 target_string = "A cat and a dog"
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], target_string)
-conv.append_message(conv.roles[1], "\n")
+conv.append_message(conv.roles[1], "
+")
 target_string = conv.get_prompt()
 target_input_ids = tokenizer(target_string, return_tensors="pt").input_ids.to(device)
 attention_mask=target_input_ids.ne(tokenizer.pad_token_id)
@@ -103,7 +106,8 @@ print("A cat and a dog similarity score: ", query_embed @ target_embed.T)
 neg_string = "A cat and a tiger"
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], neg_string)
-conv.append_message(conv.roles[1], "\n")
+conv.append_message(conv.roles[1], "
+")
 neg_string = conv.get_prompt()
 neg_input_ids = tokenizer(neg_string, return_tensors="pt").input_ids.to(device)
 attention_mask=neg_input_ids.ne(tokenizer.pad_token_id)
@@ -116,7 +120,8 @@ print("A cat and a tiger similarity score: ", query_embed @ neg_embed.T)
 pos_string = "Find me an everyday image that matches the given caption: A cat and a dog."
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], pos_string)
-conv.append_message(conv.roles[1], "\n")
+conv.append_message(conv.roles[1], "
+")
 pos_string = conv.get_prompt()
 pos_input_ids = tokenizer(pos_string, return_tensors="pt").input_ids.to(device)
 attention_mask=pos_input_ids.ne(tokenizer.pad_token_id)
@@ -125,7 +130,8 @@ pos_query_embed = model.encode_multimodal_embeddings(pos_input_ids, attention_mask=attention_mask)
 target = DEFAULT_IMAGE_TOKEN + " Represent the given image."
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], target)
-conv.append_message(conv.roles[1], "\n")
+conv.append_message(conv.roles[1], "
+")
 prompt_target = conv.get_prompt()
 target_input_ids = tokenizer_image_token(prompt_target, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
 attention_mask=target_input_ids.ne(tokenizer.pad_token_id)
@@ -138,7 +144,8 @@ print("A cat and a dog image similarity score: ", pos_query_embed @ target_embed.T)
 neg_string = "Find me an everyday image that matches the given caption: A cat and a tiger."
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], neg_string)
-conv.append_message(conv.roles[1], "\n")
+conv.append_message(conv.roles[1], "
+")
 neg_string = conv.get_prompt()
 neg_input_ids = tokenizer(neg_string, return_tensors="pt").input_ids.to(device)
 attention_mask=neg_input_ids.ne(tokenizer.pad_token_id)
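For reference, the lines these hunks touch all belong to the model card's embedding example. Below is a minimal reassembly of the text-embedding flow as a runnable sketch. It assumes `model`, `tokenizer`, and `device` come from the card's earlier setup code (outside these hunks), and the `embed_text` helper is hypothetical, introduced only to group the lines each hunk repeats.

```python
import copy

# Assumption: the card's snippet imports the LLaVA conversation templates.
from llava.conversation import conv_templates

conv_template = "qwen_1_5"  # chat template named in the hunk context above

def embed_text(text: str):
    # Hypothetical helper wrapping the repeated pattern from the hunks:
    # put `text` in the user turn, open the assistant turn, and encode.
    conv = copy.deepcopy(conv_templates[conv_template])
    conv.append_message(conv.roles[0], text)
    conv.append_message(conv.roles[1], "\n")
    prompt = conv.get_prompt()
    # `tokenizer`, `device`, and `model` come from the card's setup (not shown here).
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    attention_mask = input_ids.ne(tokenizer.pad_token_id)
    return model.encode_multimodal_embeddings(input_ids, attention_mask=attention_mask)

target_embed = embed_text("A cat and a dog")
neg_embed = embed_text("A cat and a tiger")
print("cat-and-dog vs. cat-and-tiger similarity: ", target_embed @ neg_embed.T)
```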