Update app.py
app.py CHANGED
@@ -21,7 +21,7 @@ from ip_adapter import IPAdapterXL
 from huggingface_hub import snapshot_download
 import torch
 from diffusers import AutoencoderKL, StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
-from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPTextModelWithProjection, CLIPTextModel, Blip2Processor, Blip2ForConditionalGeneration, pipeline
+from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPTextModelWithProjection, CLIPTextModel, Blip2Processor, Blip2ForConditionalGeneration, pipeline, Phi3ForCausalLM

 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
@@ -160,14 +160,15 @@ pipe = load_and_prepare_model()
 # text models
 #checkpoint = "microsoft/Phi-3.5-mini-instruct"
 checkpoint = "ford442/Phi-3.5-mini-instruct-bf16"
-captioner = pipeline(model="ydshieh/vit-gpt2-coco-en",device='cuda', task="image-to-text")
-
+#captioner = pipeline(model="ydshieh/vit-gpt2-coco-en",device='cuda:0', task="image-to-text")
+captioner_2 = pipeline(model="Salesforce/blip-image-captioning-base",device='cuda', task="image-to-text")
 #captioner_3 = pipeline(model="ford442/blip-image-to-text-large-bf16",device='cuda', task="image-to-text")
 model5 = Blip2ForConditionalGeneration.from_pretrained("ford442/blip2-image-to-text-bf16").to('cuda')
 processor5 = Blip2Processor.from_pretrained("ford442/blip2-image-to-text-bf16")
 txt_tokenizer = AutoTokenizer.from_pretrained(checkpoint, device_map='cuda', add_prefix_space=False)
 txt_tokenizer.tokenizer_legacy=False
-model =
+model = Phi3ForCausalLM.from_pretrained(checkpoint).to('cuda:0')
+#model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map='cuda') #.to('cuda')

 ip_model = IPAdapterXL(pipe, local_folder, ip_ckpt, device)
 text_encoder=CLIPTextModel.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='text_encoder',token=True).to(device=device, dtype=torch.bfloat16)
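For reference, the image-to-text pipeline this commit switches to (Salesforce/blip-image-captioning-base) returns a list of dicts with a 'generated_text' key, while the BLIP-2 pair (model5/processor5) is driven through a processor-encode / generate / decode cycle. The following is a minimal, self-contained sketch of both caption paths, assuming the same checkpoints as above and a hypothetical local image file; the captioning() helper called later in generate_30 is not shown in this diff, so the BLIP-2 lines are only a plausible reading of what it does:

import torch
from PIL import Image
from transformers import pipeline, Blip2Processor, Blip2ForConditionalGeneration

captioner_2 = pipeline(model="Salesforce/blip-image-captioning-base",
                       device="cuda", task="image-to-text")
processor5 = Blip2Processor.from_pretrained("ford442/blip2-image-to-text-bf16")
model5 = Blip2ForConditionalGeneration.from_pretrained("ford442/blip2-image-to-text-bf16").to("cuda")

img = Image.open("example.jpg").convert("RGB")        # hypothetical input image

# image-to-text pipeline: returns e.g. [{'generated_text': 'a dog on a beach'}]
blip_caption = captioner_2(img)[0]["generated_text"]

# BLIP-2: encode the image, generate token ids, decode back to text
inputs = processor5(images=img, return_tensors="pt").to("cuda")
ids = model5.generate(**inputs, max_new_tokens=40)    # generation length is illustrative
blip2_caption = processor5.batch_decode(ids, skip_special_tokens=True)[0].strip()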
@@ -282,10 +283,10 @@ def expand_prompt(prompt):
 encoded_inputs = txt_tokenizer(input_text, return_tensors="pt", return_attention_mask=True)
 encoded_inputs_2 = txt_tokenizer(input_text_2, return_tensors="pt", return_attention_mask=True)
 # Ensure all values are on the correct device
-input_ids = encoded_inputs["input_ids"].to(
-input_ids_2 = encoded_inputs_2["input_ids"].to(
-attention_mask = encoded_inputs["attention_mask"].to(
-attention_mask_2 = encoded_inputs_2["attention_mask"].to(
+input_ids = encoded_inputs["input_ids"].to("cuda:0")
+input_ids_2 = encoded_inputs_2["input_ids"].to("cuda:0")
+attention_mask = encoded_inputs["attention_mask"].to("cuda:0")
+attention_mask_2 = encoded_inputs_2["attention_mask"].to("cuda:0")
 print("-- tokenize prompt --")
 # Google T5
 #input_ids = txt_tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
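The completed .to("cuda:0") calls matter because the Phi-3.5 model is now placed explicitly on cuda:0, so the token ids and attention masks must live on that same device before generation. A minimal sketch of the tokenize-then-generate step in expand_prompt under that assumption; the prompt text and generation arguments are illustrative, not the Space's actual values:

import torch
from transformers import AutoTokenizer, Phi3ForCausalLM

checkpoint = "ford442/Phi-3.5-mini-instruct-bf16"
txt_tokenizer = AutoTokenizer.from_pretrained(checkpoint, add_prefix_space=False)
model = Phi3ForCausalLM.from_pretrained(checkpoint).to("cuda:0")

input_text = "Expand this image prompt with more visual detail: a castle at dusk"  # illustrative
encoded = txt_tokenizer(input_text, return_tensors="pt", return_attention_mask=True)

# Everything passed to generate() has to sit on the same device as the model.
input_ids = encoded["input_ids"].to("cuda:0")
attention_mask = encoded["attention_mask"].to("cuda:0")

with torch.no_grad():
    out = model.generate(input_ids=input_ids,
                         attention_mask=attention_mask,
                         max_new_tokens=64)           # illustrative length
expanded = txt_tokenizer.decode(out[0], skip_special_tokens=True)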
@@ -352,15 +353,15 @@ def generate_30(
 sd_image_a.resize((height,width), Image.LANCZOS)
 caption=[]
 caption_2=[]
-caption.append(captioner(sd_image_a))
-
+#caption.append(captioner(sd_image_a))
+caption.append(captioner_2(sd_image_a))
 #caption.append(captioner_3(sd_image_a))
 caption_2.append(captioning(sd_image_a))
 if latent_file_2 is not None: # Check if a latent file is provided
 sd_image_b = Image.open(latent_file_2.name).convert('RGB')
 sd_image_b.resize((height,width), Image.LANCZOS)
-caption.append(captioner(sd_image_b))
-
+#caption.append(captioner(sd_image_b))
+caption.append(captioner_2(sd_image_b))
 #caption.append(captioner_3(sd_image_b))
 caption_2.append(captioning(sd_image_b))
 else:
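Note that captioner_2(image) returns a list like [{'generated_text': '...'}], so each caption.append(captioner_2(...)) stores a one-element list of dicts rather than a plain string. How the Space consumes that list downstream is not part of this diff; the sketch below shows only one hedged way such entries could be flattened into plain caption strings:

def flatten_captions(caption):                        # hypothetical helper, not in app.py
    """Turn [[{'generated_text': '...'}], ...] into a list of plain strings."""
    texts = []
    for entry in caption:
        if isinstance(entry, list):                   # pipeline output: list of dicts
            texts.extend(d.get("generated_text", "") for d in entry)
        else:                                         # already a string (e.g. from captioning())
            texts.append(str(entry))
    return texts

# e.g. flatten_captions(caption) -> ['a photo of a castle at dusk', ...]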
@@ -368,8 +369,8 @@ def generate_30(
 if latent_file_3 is not None: # Check if a latent file is provided
 sd_image_c = Image.open(latent_file_3.name).convert('RGB')
 sd_image_c.resize((height,width), Image.LANCZOS)
-caption.append(captioner(sd_image_c))
-
+#caption.append(captioner(sd_image_c))
+caption.append(captioner_2(sd_image_c))
 #caption.append(captioner_3(sd_image_c))
 caption_2.append(captioning(sd_image_c))
 else:
@@ -377,8 +378,8 @@ def generate_30(
 if latent_file_4 is not None: # Check if a latent file is provided
 sd_image_d = Image.open(latent_file_4.name).convert('RGB')
 sd_image_d.resize((height,width), Image.LANCZOS)
-caption.append(captioner(sd_image_d))
-
+#caption.append(captioner(sd_image_d))
+caption.append(captioner_2(sd_image_d))
 #caption.append(captioner_3(sd_image_d))
 caption_2.append(captioning(sd_image_d))
 else:
@@ -386,8 +387,8 @@ def generate_30(
 if latent_file_5 is not None: # Check if a latent file is provided
 sd_image_e = Image.open(latent_file_5.name).convert('RGB')
 sd_image_e.resize((height,width), Image.LANCZOS)
-caption.append(captioner(sd_image_e))
-
+#caption.append(captioner(sd_image_e))
+caption.append(captioner_2(sd_image_e))
 #caption.append(captioner_3(sd_image_e))
 caption_2.append(captioning(sd_image_e))
 else:
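The same swap from captioner to captioner_2 is applied to each of the five optional reference images (sd_image_a through sd_image_e). A compact sketch of that repeated pattern, assuming the uploaded file objects expose .name as in the diff; the loop form and helper name are only illustrative, the Space itself keeps the blocks unrolled:

from PIL import Image

def caption_reference_images(latent_files, height, width, captioner_2, captioning):
    """Hypothetical helper: open, resize and caption each provided reference image."""
    caption, caption_2 = [], []
    for f in latent_files:                  # e.g. [latent_file, latent_file_2, ..., latent_file_5]
        if f is None:                       # slot left empty in the UI
            continue
        img = Image.open(f.name).convert("RGB")
        img = img.resize((height, width), Image.LANCZOS)   # resize() returns a new image
        caption.append(captioner_2(img))    # image-to-text pipeline output
        caption_2.append(captioning(img))   # BLIP-2 helper defined elsewhere in app.py
    return caption, caption_2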