Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -21,7 +21,7 @@ from ip_adapter import IPAdapterXL
 from huggingface_hub import snapshot_download
 import torch
 from diffusers import AutoencoderKL, StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
-from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPTextModelWithProjection, CLIPTextModel, Blip2Processor, Blip2ForConditionalGeneration, pipeline
+from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPTextModelWithProjection, CLIPTextModel, Blip2Processor, Blip2ForConditionalGeneration, pipeline, Phi3ForCausalLM
 
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
@@ -160,14 +160,15 @@ pipe = load_and_prepare_model()
 # text models
 #checkpoint = "microsoft/Phi-3.5-mini-instruct"
 checkpoint = "ford442/Phi-3.5-mini-instruct-bf16"
-captioner = pipeline(model="ydshieh/vit-gpt2-coco-en",device='cuda', task="image-to-text")
-
+#captioner = pipeline(model="ydshieh/vit-gpt2-coco-en",device='cuda:0', task="image-to-text")
+captioner_2 = pipeline(model="Salesforce/blip-image-captioning-base",device='cuda', task="image-to-text")
 #captioner_3 = pipeline(model="ford442/blip-image-to-text-large-bf16",device='cuda', task="image-to-text")
 model5 = Blip2ForConditionalGeneration.from_pretrained("ford442/blip2-image-to-text-bf16").to('cuda')
 processor5 = Blip2Processor.from_pretrained("ford442/blip2-image-to-text-bf16")
 txt_tokenizer = AutoTokenizer.from_pretrained(checkpoint, device_map='cuda', add_prefix_space=False)
 txt_tokenizer.tokenizer_legacy=False
-model =
+model = Phi3ForCausalLM.from_pretrained(checkpoint).to('cuda:0')
+#model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map='cuda') #.to('cuda')
 
 ip_model = IPAdapterXL(pipe, local_folder, ip_ckpt, device)
 text_encoder=CLIPTextModel.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='text_encoder',token=True).to(device=device, dtype=torch.bfloat16)
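For reference, the new captioner_2 pipeline and the Phi3ForCausalLM loader introduced above can be exercised in isolation. This is a minimal sketch assuming the same checkpoints, a single CUDA device, and a placeholder input image; it is not part of the commit:

import torch
from PIL import Image
from transformers import pipeline, Phi3ForCausalLM, AutoTokenizer

# BLIP base captioner on the GPU, matching the pipeline call in the commit
captioner_2 = pipeline(model="Salesforce/blip-image-captioning-base", device='cuda', task="image-to-text")

# Phi-3.5 text model loaded with the architecture-specific class instead of AutoModelForCausalLM
checkpoint = "ford442/Phi-3.5-mini-instruct-bf16"
txt_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = Phi3ForCausalLM.from_pretrained(checkpoint).to('cuda:0')

# "example.png" is a placeholder; the app passes PIL images opened from uploaded files
result = captioner_2(Image.open("example.png").convert("RGB"))
print(result)  # typically a list like [{'generated_text': '...'}]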
@@ -282,10 +283,10 @@ def expand_prompt(prompt):
 encoded_inputs = txt_tokenizer(input_text, return_tensors="pt", return_attention_mask=True)
 encoded_inputs_2 = txt_tokenizer(input_text_2, return_tensors="pt", return_attention_mask=True)
 # Ensure all values are on the correct device
-input_ids = encoded_inputs["input_ids"].to(
-input_ids_2 = encoded_inputs_2["input_ids"].to(
-attention_mask = encoded_inputs["attention_mask"].to(
-attention_mask_2 = encoded_inputs_2["attention_mask"].to(
+input_ids = encoded_inputs["input_ids"].to("cuda:0")
+input_ids_2 = encoded_inputs_2["input_ids"].to("cuda:0")
+attention_mask = encoded_inputs["attention_mask"].to("cuda:0")
+attention_mask_2 = encoded_inputs_2["attention_mask"].to("cuda:0")
 print("-- tokenize prompt --")
 # Google T5
 #input_ids = txt_tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
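The edit above completes the previously truncated .to( calls so the tokenized tensors end up on the same device as the Phi-3.5 model. Below is a hedged sketch of the intended tokenize-then-generate flow, reusing txt_tokenizer and model as loaded earlier; the prompt text and generation settings are illustrative and not taken from the commit:

import torch

input_text = "Expand this prompt: a castle at dusk"  # illustrative only
encoded_inputs = txt_tokenizer(input_text, return_tensors="pt", return_attention_mask=True)

# Move the tensors to the device that holds the model ('cuda:0' here)
input_ids = encoded_inputs["input_ids"].to("cuda:0")
attention_mask = encoded_inputs["attention_mask"].to("cuda:0")

with torch.no_grad():
    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=64)
print(txt_tokenizer.decode(outputs[0], skip_special_tokens=True))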
@@ -352,15 +353,15 @@ def generate_30(
 sd_image_a.resize((height,width), Image.LANCZOS)
 caption=[]
 caption_2=[]
-caption.append(captioner(sd_image_a))
-
+#caption.append(captioner(sd_image_a))
+caption.append(captioner_2(sd_image_a))
 #caption.append(captioner_3(sd_image_a))
 caption_2.append(captioning(sd_image_a))
 if latent_file_2 is not None: # Check if a latent file is provided
 sd_image_b = Image.open(latent_file_2.name).convert('RGB')
 sd_image_b.resize((height,width), Image.LANCZOS)
-caption.append(captioner(sd_image_b))
-
+#caption.append(captioner(sd_image_b))
+caption.append(captioner_2(sd_image_b))
 #caption.append(captioner_3(sd_image_b))
 caption_2.append(captioning(sd_image_b))
 else:
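One thing to keep in mind with the switch to captioner_2: an image-to-text pipeline returns a list of dicts, so each caption.append(captioner_2(...)) entry is a list rather than a plain string. If bare text is wanted downstream it has to be unpacked; the helper below is hypothetical, reuses captioner_2 and sd_image_a from the surrounding code, and only illustrates that output shape:

def caption_text(pipeline_output):
    # pipeline output is typically [{'generated_text': 'a photo of ...'}]
    return pipeline_output[0]["generated_text"]

caption = []
caption.append(captioner_2(sd_image_a))
print(caption_text(caption[-1]))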
@@ -368,8 +369,8 @@ def generate_30(
 if latent_file_3 is not None: # Check if a latent file is provided
 sd_image_c = Image.open(latent_file_3.name).convert('RGB')
 sd_image_c.resize((height,width), Image.LANCZOS)
-caption.append(captioner(sd_image_c))
-
+#caption.append(captioner(sd_image_c))
+caption.append(captioner_2(sd_image_c))
 #caption.append(captioner_3(sd_image_c))
 caption_2.append(captioning(sd_image_c))
 else:
@@ -377,8 +378,8 @@ def generate_30(
 if latent_file_4 is not None: # Check if a latent file is provided
 sd_image_d = Image.open(latent_file_4.name).convert('RGB')
 sd_image_d.resize((height,width), Image.LANCZOS)
-caption.append(captioner(sd_image_d))
-
+#caption.append(captioner(sd_image_d))
+caption.append(captioner_2(sd_image_d))
 #caption.append(captioner_3(sd_image_d))
 caption_2.append(captioning(sd_image_d))
 else:
@@ -386,8 +387,8 @@ def generate_30(
 if latent_file_5 is not None: # Check if a latent file is provided
 sd_image_e = Image.open(latent_file_5.name).convert('RGB')
 sd_image_e.resize((height,width), Image.LANCZOS)
-caption.append(captioner(sd_image_e))
-
+#caption.append(captioner(sd_image_e))
+caption.append(captioner_2(sd_image_e))
 #caption.append(captioner_3(sd_image_e))
 caption_2.append(captioning(sd_image_e))
 else:
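The same captioning block is repeated for latent_file_2 through latent_file_5. A possible consolidation is sketched below, reusing captioner_2, captioning, caption, caption_2, height, and width from the surrounding code; it is only an illustration, since the else branches and the exact indentation are not visible in the diff:

from PIL import Image

# hypothetical refactor of the repeated per-image captioning blocks
for latent_file in (latent_file_2, latent_file_3, latent_file_4, latent_file_5):
    if latent_file is not None:  # Check if a latent file is provided
        img = Image.open(latent_file.name).convert('RGB')
        # note: Image.resize returns a new image; the result is discarded here, mirroring the original code
        img.resize((height, width), Image.LANCZOS)
        caption.append(captioner_2(img))
        caption_2.append(captioning(img))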