Upload batch-caption.py
scripts/batch-caption.py  CHANGED  (+12 -6)
@@ -45,9 +45,9 @@ parser.add_argument("--top-p", type=lambda x: none_or_type(x, float), default=0.
 parser.add_argument("--top-k", type=lambda x: none_or_type(x, int), default=None, help="Top-k sampling")
 parser.add_argument("--max-new-tokens", type=int, default=256, help="Maximum length of the generated caption (in tokens)")
 parser.add_argument("--num-workers", type=int, default=4, help="Number of workers loading images in parallel")
-parser.add_argument("--model", type=str, default="fancyfeast/llama-joycaption-alpha-two-hf-llava", help="Model to use")
-
-parser.add_argument("--nf4", action="store_true", default=
+#parser.add_argument("--model", type=str, default="fancyfeast/llama-joycaption-alpha-two-hf-llava", help="Model to use")
+parser.add_argument("--model", type=str, default="John6666/llama-joycaption-alpha-two-hf-llava-nf4", help="Model to use")
+parser.add_argument("--nf4", action="store_true", default=True, help="Use NF4 (default: bfloat16)")
 
 PIL.Image.MAX_IMAGE_PIXELS = 933120000  # Quiets Pillow from giving warnings on really large images (WARNING: Exposes a risk of DoS from malicious images)
 device = "cuda:0" if torch.cuda.is_available() else "cpu"

@@ -89,8 +89,12 @@ def main():
         bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.bfloat16)
     tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=True)
     assert isinstance(tokenizer, PreTrainedTokenizer) or isinstance(tokenizer, PreTrainedTokenizerFast), f"Tokenizer is of type {type(tokenizer)}"
-    if IS_NF4:
-
+    if IS_NF4:
+        llava_model = LlavaForConditionalGeneration.from_pretrained(args.model, torch_dtype="bfloat16", quantization_config=nf4_config).eval()
+        # https://github.com/fpgaminer/joycaption/issues/3#issuecomment-2619253277
+        attention = llava_model.vision_tower.vision_model.head.attention
+        attention.out_proj = torch.nn.Linear(attention.embed_dim, attention.embed_dim, device=llava_model.device, dtype=torch.bfloat16)
+    else: llava_model = LlavaForConditionalGeneration.from_pretrained(args.model, torch_dtype="bfloat16", device_map="auto").eval()
     assert isinstance(llava_model, LlavaForConditionalGeneration)
 
     dataset = ImageDataset(prompts, image_paths, tokenizer, llava_model.config.image_token_index, llava_model.config.image_seq_length)

@@ -104,6 +108,7 @@ def main():
         vision_dtype = llava_model.vision_tower.vision_model.embeddings.patch_embedding.weight.dtype
         vision_device = llava_model.vision_tower.vision_model.embeddings.patch_embedding.weight.device
         language_device = llava_model.language_model.get_input_embeddings().weight.device
+        print(vision_device, vision_dtype, language_device)
 
         # Move to GPU
         pixel_values = batch['pixel_values'].to(vision_device, non_blocking=True)

@@ -336,4 +341,5 @@ if __name__ == "__main__":
 
     # https://github.com/huggingface/peft/issues/156
     # https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1331
-    # https://github.com/huggingface/peft/issues/1831
+    # https://github.com/huggingface/peft/issues/1831
+    # https://github.com/fpgaminer/joycaption/issues/3