Update pipeline_stable_diffusion_3_ipa.py
pipeline_stable_diffusion_3_ipa.py
```diff
@@ -1154,9 +1154,6 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin):
         clip_image_embeds_1 = clip_image_embeds_1.to(device, dtype=dtype)
         clip_image_embeds_1 = self.image_encoder(clip_image_embeds_1, output_hidden_states=True).hidden_states[-2]
         print('encoder output shape: ', clip_image_embeds_1.shape)
-
-        print('projection model output shape: ', clip_image_embeds_1.shape)
-
         clip_image_embeds_1 = clip_image_embeds_1 * scale_1
         image_prompt_embeds_list.append(clip_image_embeds_1)
         if clip_image_2 != None:
```
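The hunk above just drops a leftover debug print from a removed projection step. For context, the surviving encoder call takes `hidden_states[-2]`, the penultimate layer of the image encoder rather than its final pooled output, which is the usual choice for IP-Adapter-style conditioning. A minimal sketch of that pattern with a standard `transformers` CLIP vision tower; the checkpoint name and random input are stand-ins, and this pipeline's actual `image_encoder` may differ:

```python
import torch
from transformers import CLIPVisionModelWithProjection

# Illustrative checkpoint only; the pipeline's image_encoder may be different.
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
    "openai/clip-vit-large-patch14"
)

pixel_values = torch.randn(1, 3, 224, 224)  # stand-in for a preprocessed image
out = image_encoder(pixel_values, output_hidden_states=True)

# Penultimate hidden layer: (batch, num_patches + 1, hidden_dim),
# i.e. (1, 257, 1024) for ViT-L/14 at 224x224.
clip_image_embeds = out.hidden_states[-2]
```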
```diff
@@ -1199,11 +1196,11 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin):
         clip_image_embeds_cat_list = torch.cat(image_prompt_embeds_list).mean(dim=1)
         #clip_image_embeds_cat_list = torch.cat(image_prompt_embeds_list, dim=1)
         print('catted embeds list with mean: ',clip_image_embeds_cat_list.shape)
-
-
+        seq_len, _ = clip_image_embeds_cat_list.shape
+        clip_image_embeds_cat_list_repeat = clip_image_embeds_cat_list.repeat(1, 1, 1)
         clip_image_embeds_view = clip_image_embeds_cat_list #.unsqueeze(0) # Added unsqueeze here instead
-
-
+        print('catted embeds repeat: ',clip_image_embeds_view.shape)
+        clip_image_embeds_view = clip_image_embeds_view.view(1, seq_len, -1)
         print('catted viewed: ',clip_image_embeds_view.shape)
         zeros_tensor = torch.zeros_like(clip_image_embeds_view)
         #zeros_tensor = torch.zeros_like(clip_image_embeds_view[:, :image_prompt_embeds_list[0].shape[1], :]) # Make zeros tensor match the sequence length of a single image embedding
```
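The added lines rebuild a batch axis after mean pooling: `torch.cat(image_prompt_embeds_list).mean(dim=1)` collapses each reference image's token sequence into a single vector, giving `(num_images, dim)`, and `view(1, seq_len, -1)` then stacks those vectors as a sequence of length `num_images` so that `zeros_tensor`, the unconditional branch for classifier-free guidance, can mirror its shape. (`clip_image_embeds_cat_list_repeat` is assigned but appears unused in the lines shown.) A shape walk-through under assumed token and hidden sizes:

```python
import torch

# Assumed shapes: each per-image embedding is (1, num_tokens, dim);
# 257 tokens / 1024 dims would match a ViT-L/14 penultimate layer.
num_images, num_tokens, dim = 2, 257, 1024
image_prompt_embeds_list = [
    torch.randn(1, num_tokens, dim) for _ in range(num_images)
]

# cat along dim 0, then average over the token axis -> one vector per image
cat_mean = torch.cat(image_prompt_embeds_list).mean(dim=1)  # (num_images, dim)

# .repeat(1, 1, 1) on a 2-D tensor prepends a singleton batch dim
cat_repeat = cat_mean.repeat(1, 1, 1)                       # (1, num_images, dim)

# the added lines: restore a batch axis, one "token" per reference image
seq_len, _ = cat_mean.shape                                 # seq_len == num_images
embeds_view = cat_mean.view(1, seq_len, -1)                 # (1, num_images, dim)

# matching all-zeros embedding for the unconditional CFG branch
zeros_tensor = torch.zeros_like(embeds_view)                # (1, num_images, dim)

print(embeds_view.shape, zeros_tensor.shape)
```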