1inkusFace committed (verified)
Commit 6fbeb7f · Parent(s): d877871

Update app.py

Files changed (1): app.py (+19 -18)
app.py CHANGED

@@ -21,7 +21,7 @@ from ip_adapter import IPAdapterXL
 from huggingface_hub import snapshot_download
 import torch
 from diffusers import AutoencoderKL, StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
-from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPTextModelWithProjection, CLIPTextModel, Blip2Processor, Blip2ForConditionalGeneration, pipeline
+from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPTextModelWithProjection, CLIPTextModel, Blip2Processor, Blip2ForConditionalGeneration, pipeline, Phi3ForCausalLM
 
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
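The only change in this hunk is appending Phi3ForCausalLM to the transformers import; it backs the explicit model load further down. Phi3ForCausalLM only ships with fairly recent transformers releases, so a defensive variant (my sketch, not part of the commit) could fall back to the auto class on older installs:

# Sketch, not in the commit: tolerate older transformers without Phi-3 support
try:
    from transformers import Phi3ForCausalLM
except ImportError:
    from transformers import AutoModelForCausalLM as Phi3ForCausalLM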
@@ -160,14 +160,15 @@ pipe = load_and_prepare_model()
 # text models
 #checkpoint = "microsoft/Phi-3.5-mini-instruct"
 checkpoint = "ford442/Phi-3.5-mini-instruct-bf16"
-captioner = pipeline(model="ydshieh/vit-gpt2-coco-en",device='cuda', task="image-to-text")
-#captioner_2 = pipeline(model="Salesforce/blip-image-captioning-base",device='cuda', task="image-to-text")
+#captioner = pipeline(model="ydshieh/vit-gpt2-coco-en",device='cuda:0', task="image-to-text")
+captioner_2 = pipeline(model="Salesforce/blip-image-captioning-base",device='cuda', task="image-to-text")
 #captioner_3 = pipeline(model="ford442/blip-image-to-text-large-bf16",device='cuda', task="image-to-text")
 model5 = Blip2ForConditionalGeneration.from_pretrained("ford442/blip2-image-to-text-bf16").to('cuda')
 processor5 = Blip2Processor.from_pretrained("ford442/blip2-image-to-text-bf16")
 txt_tokenizer = AutoTokenizer.from_pretrained(checkpoint, device_map='cuda', add_prefix_space=False)
 txt_tokenizer.tokenizer_legacy=False
-model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map='cuda') #.to('cuda')
+model = Phi3ForCausalLM.from_pretrained(checkpoint).to('cuda:0')
+#model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map='cuda') #.to('cuda')
 
 ip_model = IPAdapterXL(pipe, local_folder, ip_ckpt, device)
 text_encoder=CLIPTextModel.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='text_encoder',token=True).to(device=device, dtype=torch.bfloat16)
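Two things change here: the active captioner switches from ydshieh/vit-gpt2-coco-en to Salesforce/blip-image-captioning-base (as captioner_2, with the old line commented out), and the Phi-3.5 model is now loaded through the concrete Phi3ForCausalLM class and pinned to cuda:0 with an explicit .to() call instead of letting accelerate place it via device_map. A minimal sketch of the before/after loading styles, using the checkpoint name from the commit:

from transformers import AutoModelForCausalLM, Phi3ForCausalLM

checkpoint = "ford442/Phi-3.5-mini-instruct-bf16"

# Before: accelerate dispatches the weights (placement is implicit)
# model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map='cuda')

# After: concrete class, every parameter on one explicit device
model = Phi3ForCausalLM.from_pretrained(checkpoint).to('cuda:0')

Pinning the model to a single device is what makes the literal "cuda:0" moves in expand_prompt below safe.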
@@ -282,10 +283,10 @@ def expand_prompt(prompt):
     encoded_inputs = txt_tokenizer(input_text, return_tensors="pt", return_attention_mask=True)
     encoded_inputs_2 = txt_tokenizer(input_text_2, return_tensors="pt", return_attention_mask=True)
     # Ensure all values are on the correct device
-    input_ids = encoded_inputs["input_ids"].to(device)
-    input_ids_2 = encoded_inputs_2["input_ids"].to(device)
-    attention_mask = encoded_inputs["attention_mask"].to(device)
-    attention_mask_2 = encoded_inputs_2["attention_mask"].to(device)
+    input_ids = encoded_inputs["input_ids"].to("cuda:0")
+    input_ids_2 = encoded_inputs_2["input_ids"].to("cuda:0")
+    attention_mask = encoded_inputs["attention_mask"].to("cuda:0")
+    attention_mask_2 = encoded_inputs_2["attention_mask"].to("cuda:0")
     print("-- tokenize prompt --")
     # Google T5
     #input_ids = txt_tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
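With the model pinned to cuda:0, the tokenized inputs are moved to the same literal device string rather than the module-level device variable, so model and inputs cannot end up on different devices. A sketch of how these tensors are then consumed, assuming model and txt_tokenizer from the hunk above (the prompt text and generation arguments are illustrative, not from the diff):

import torch

encoded = txt_tokenizer("Describe a sunset over the ocean.",
                        return_tensors="pt", return_attention_mask=True)
input_ids = encoded["input_ids"].to("cuda:0")
attention_mask = encoded["attention_mask"].to("cuda:0")

with torch.no_grad():
    out = model.generate(input_ids=input_ids, attention_mask=attention_mask,
                         max_new_tokens=64)
print(txt_tokenizer.decode(out[0], skip_special_tokens=True))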
@@ -352,15 +353,15 @@ def generate_30(
     sd_image_a.resize((height,width), Image.LANCZOS)
     caption=[]
     caption_2=[]
-    caption.append(captioner(sd_image_a))
-    #caption.append(captioner_2(sd_image_a))
+    #caption.append(captioner(sd_image_a))
+    caption.append(captioner_2(sd_image_a))
     #caption.append(captioner_3(sd_image_a))
     caption_2.append(captioning(sd_image_a))
     if latent_file_2 is not None: # Check if a latent file is provided
         sd_image_b = Image.open(latent_file_2.name).convert('RGB')
         sd_image_b.resize((height,width), Image.LANCZOS)
-        caption.append(captioner(sd_image_b))
-        #caption.append(captioner_2(sd_image_b))
+        #caption.append(captioner(sd_image_b))
+        caption.append(captioner_2(sd_image_b))
         #caption.append(captioner_3(sd_image_b))
         caption_2.append(captioning(sd_image_b))
     else:
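The call sites mirror the load-time swap: caption.append(captioner_2(...)) now does the work and the vit-gpt2 call is commented out. Note that a transformers image-to-text pipeline returns a list of dicts, so each appended entry looks like [{'generated_text': '...'}] and needs unwrapping downstream. A sketch of the shape, with a hypothetical image path:

from PIL import Image
from transformers import pipeline

captioner_2 = pipeline(model="Salesforce/blip-image-captioning-base",
                       device='cuda', task="image-to-text")
result = captioner_2(Image.open("example.jpg"))  # hypothetical path
# result is e.g. [{'generated_text': 'a photo of ...'}]
text = result[0]['generated_text']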
@@ -368,8 +369,8 @@ def generate_30(
     if latent_file_3 is not None: # Check if a latent file is provided
         sd_image_c = Image.open(latent_file_3.name).convert('RGB')
         sd_image_c.resize((height,width), Image.LANCZOS)
-        caption.append(captioner(sd_image_c))
-        #caption.append(captioner_2(sd_image_c))
+        #caption.append(captioner(sd_image_c))
+        caption.append(captioner_2(sd_image_c))
         #caption.append(captioner_3(sd_image_c))
         caption_2.append(captioning(sd_image_c))
     else:
@@ -377,8 +378,8 @@ def generate_30(
     if latent_file_4 is not None: # Check if a latent file is provided
         sd_image_d = Image.open(latent_file_4.name).convert('RGB')
         sd_image_d.resize((height,width), Image.LANCZOS)
-        caption.append(captioner(sd_image_d))
-        #caption.append(captioner_2(sd_image_d))
+        #caption.append(captioner(sd_image_d))
+        caption.append(captioner_2(sd_image_d))
         #caption.append(captioner_3(sd_image_d))
         caption_2.append(captioning(sd_image_d))
     else:
@@ -386,8 +387,8 @@ def generate_30(
     if latent_file_5 is not None: # Check if a latent file is provided
         sd_image_e = Image.open(latent_file_5.name).convert('RGB')
         sd_image_e.resize((height,width), Image.LANCZOS)
-        caption.append(captioner(sd_image_e))
-        #caption.append(captioner_2(sd_image_e))
+        #caption.append(captioner(sd_image_e))
+        caption.append(captioner_2(sd_image_e))
         #caption.append(captioner_3(sd_image_e))
         caption_2.append(captioning(sd_image_e))
     else:
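The hunks at lines 368, 377, and 386 repeat the identical swap for latent_file_3 through latent_file_5. Since the five blocks differ only in the file variable, a follow-up refactor could collapse them into one helper; the sketch below is mine, not the commit's, and assumes captioner_2 and captioning from the script. It also assigns the result of resize(), which the committed code discards (PIL's Image.resize returns a new image rather than resizing in place):

# Sketch, not in the commit: one loop instead of five near-identical blocks
from PIL import Image

def caption_inputs(latent_files, height, width):
    captions, captions_2 = [], []
    for f in latent_files:
        if f is None:
            continue
        img = Image.open(f.name).convert('RGB')
        # Keep the resized copy; note PIL expects (width, height), the
        # (height, width) order here is preserved from the source.
        img = img.resize((height, width), Image.LANCZOS)
        captions.append(captioner_2(img))
        captions_2.append(captioning(img))
    return captions, captions_2

Usage would be something like caption, caption_2 = caption_inputs([latent_file, latent_file_2, latent_file_3, latent_file_4, latent_file_5], height, width).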
 