Update app.py

app.py CHANGED
@@ -21,7 +21,7 @@ from ip_adapter import IPAdapterXL
 from huggingface_hub import snapshot_download
 import torch
 from diffusers import AutoencoderKL, StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
-from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPTextModelWithProjection, CLIPTextModel, Blip2Processor, Blip2ForConditionalGeneration, pipeline
+from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPTextModelWithProjection, CLIPTextModel, Blip2Processor, Blip2ForConditionalGeneration, pipeline, Phi3ForCausalLM
 
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
@@ -160,14 +160,15 @@ pipe = load_and_prepare_model()
 # text models
 #checkpoint = "microsoft/Phi-3.5-mini-instruct"
 checkpoint = "ford442/Phi-3.5-mini-instruct-bf16"
-captioner = pipeline(model="ydshieh/vit-gpt2-coco-en",device='cuda', task="image-to-text")
-
+#captioner = pipeline(model="ydshieh/vit-gpt2-coco-en",device='cuda:0', task="image-to-text")
+captioner_2 = pipeline(model="Salesforce/blip-image-captioning-base",device='cuda', task="image-to-text")
 #captioner_3 = pipeline(model="ford442/blip-image-to-text-large-bf16",device='cuda', task="image-to-text")
 model5 = Blip2ForConditionalGeneration.from_pretrained("ford442/blip2-image-to-text-bf16").to('cuda')
 processor5 = Blip2Processor.from_pretrained("ford442/blip2-image-to-text-bf16")
 txt_tokenizer = AutoTokenizer.from_pretrained(checkpoint, device_map='cuda', add_prefix_space=False)
 txt_tokenizer.tokenizer_legacy=False
-model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map='cuda') #.to('cuda')
+model = Phi3ForCausalLM.from_pretrained(checkpoint).to('cuda:0')
+#model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map='cuda') #.to('cuda')
 
 ip_model = IPAdapterXL(pipe, local_folder, ip_ckpt, device)
 text_encoder=CLIPTextModel.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='text_encoder',token=True).to(device=device, dtype=torch.bfloat16)
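This hunk swaps the ViT-GPT2 captioner for a BLIP base captioning pipeline and loads the Phi-3.5 checkpoint through the concrete Phi3ForCausalLM class, pinned to cuda:0, instead of AutoModelForCausalLM with a device_map. A minimal standalone sketch of the new loading and calling pattern; the model IDs come from the diff, while the test image, prompt, and generation settings are illustrative assumptions:

    from PIL import Image
    from transformers import AutoTokenizer, Phi3ForCausalLM, pipeline

    checkpoint = "ford442/Phi-3.5-mini-instruct-bf16"
    # BLIP captioner behind the generic image-to-text pipeline.
    captioner_2 = pipeline(task="image-to-text",
                           model="Salesforce/blip-image-captioning-base",
                           device="cuda")
    txt_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    # The concrete class skips AutoModel's architecture dispatch;
    # .to("cuda:0") pins every weight to the first GPU.
    model = Phi3ForCausalLM.from_pretrained(checkpoint).to("cuda:0")

    img = Image.open("example.jpg").convert("RGB")   # illustrative input
    print(captioner_2(img))   # -> [{'generated_text': '...'}]

    prompt_ids = txt_tokenizer("Describe a sunset.", return_tensors="pt").to("cuda:0")
    out = model.generate(**prompt_ids, max_new_tokens=32)
    print(txt_tokenizer.decode(out[0], skip_special_tokens=True))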
@@ -282,10 +283,10 @@ def expand_prompt(prompt):
         encoded_inputs = txt_tokenizer(input_text, return_tensors="pt", return_attention_mask=True)
         encoded_inputs_2 = txt_tokenizer(input_text_2, return_tensors="pt", return_attention_mask=True)
         # Ensure all values are on the correct device
-        input_ids = encoded_inputs["input_ids"].to(device)
-        input_ids_2 = encoded_inputs_2["input_ids"].to(device)
-        attention_mask = encoded_inputs["attention_mask"].to(device)
-        attention_mask_2 = encoded_inputs_2["attention_mask"].to(device)
+        input_ids = encoded_inputs["input_ids"].to("cuda:0")
+        input_ids_2 = encoded_inputs_2["input_ids"].to("cuda:0")
+        attention_mask = encoded_inputs["attention_mask"].to("cuda:0")
+        attention_mask_2 = encoded_inputs_2["attention_mask"].to("cuda:0")
         print("-- tokenize prompt --")
          # Google T5
         #input_ids = txt_tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
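Since the model now lives explicitly on cuda:0, the tokenizer outputs must be moved to the same device; mismatched tensor devices would raise a RuntimeError inside generate. A more compact equivalent, assuming the surrounding expand_prompt scope (input_text as defined there); BatchEncoding.to() moves every tensor at once, and model.device avoids hard-coding the device string:

    # Sketch of an equivalent placement, not what app.py does verbatim.
    encoded_inputs = txt_tokenizer(input_text, return_tensors="pt",
                                   return_attention_mask=True).to(model.device)
    input_ids = encoded_inputs["input_ids"]
    attention_mask = encoded_inputs["attention_mask"]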
@@ -352,15 +353,15 @@ def generate_30(
         sd_image_a.resize((height,width), Image.LANCZOS)
         caption=[]
         caption_2=[]
-        caption.append(captioner(sd_image_a))
-
+        #caption.append(captioner(sd_image_a))
+        caption.append(captioner_2(sd_image_a))
         #caption.append(captioner_3(sd_image_a))
         caption_2.append(captioning(sd_image_a))
         if latent_file_2 is not None:  # Check if a latent file is provided
             sd_image_b = Image.open(latent_file_2.name).convert('RGB')
             sd_image_b.resize((height,width), Image.LANCZOS)
-            caption.append(captioner(sd_image_b))
-
+            #caption.append(captioner(sd_image_b))
+            caption.append(captioner_2(sd_image_b))
             #caption.append(captioner_3(sd_image_b))
             caption_2.append(captioning(sd_image_b))
         else:
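Two pre-existing issues in the context lines above are untouched by this commit: PIL's Image.resize returns a resized copy rather than operating in place, so the bare sd_image_a.resize(...) calls discard their result, and resize expects a (width, height) tuple, not (height, width). A hedged correction would be:

    # PIL returns a new image; reassign it, and order the tuple (width, height).
    sd_image_a = sd_image_a.resize((width, height), Image.LANCZOS)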
@@ -368,8 +369,8 @@ def generate_30(
         if latent_file_3 is not None:  # Check if a latent file is provided
             sd_image_c = Image.open(latent_file_3.name).convert('RGB')
             sd_image_c.resize((height,width), Image.LANCZOS)
-            caption.append(captioner(sd_image_c))
-
+            #caption.append(captioner(sd_image_c))
+            caption.append(captioner_2(sd_image_c))
             #caption.append(captioner_3(sd_image_c))
             caption_2.append(captioning(sd_image_c))
         else:
@@ -377,8 +378,8 @@ def generate_30(
         if latent_file_4 is not None:  # Check if a latent file is provided
             sd_image_d = Image.open(latent_file_4.name).convert('RGB')
             sd_image_d.resize((height,width), Image.LANCZOS)
-            caption.append(captioner(sd_image_d))
-
+            #caption.append(captioner(sd_image_d))
+            caption.append(captioner_2(sd_image_d))
             #caption.append(captioner_3(sd_image_d))
             caption_2.append(captioning(sd_image_d))
         else:
@@ -386,8 +387,8 @@ def generate_30(
         if latent_file_5 is not None:  # Check if a latent file is provided
             sd_image_e = Image.open(latent_file_5.name).convert('RGB')
             sd_image_e.resize((height,width), Image.LANCZOS)
-            caption.append(captioner(sd_image_e))
-
+            #caption.append(captioner(sd_image_e))
+            caption.append(captioner_2(sd_image_e))
             #caption.append(captioner_3(sd_image_e))
             caption_2.append(captioning(sd_image_e))
         else:
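The same captioner swap repeats verbatim for the five optional input images (sd_image_a through sd_image_e). A hypothetical refactor, not part of this commit and ignoring the else branches elided from the diff, would confine future model swaps to one place:

    def caption_image(img):
        """Hypothetical helper: run both captioners on one image."""
        return captioner_2(img), captioning(img)

    for latent_file in (latent_file_2, latent_file_3, latent_file_4, latent_file_5):
        if latent_file is not None:
            img = Image.open(latent_file.name).convert('RGB')
            c, c2 = caption_image(img)
            caption.append(c)
            caption_2.append(c2)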