Update pipeline_stable_diffusion_3_ipa.py
Browse files
pipeline_stable_diffusion_3_ipa.py
CHANGED
@@ -1204,7 +1204,16 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
|
|
1204 |
#clip_image_embeds = torch.cat([torch.zeros_like(torch.stack(image_prompt_embeds_list)), torch.stack(image_prompt_embeds_list)], dim=0).mean(dim=0)
|
1205 |
# FAILS clip_image_embeds = torch.cat(torch.stack(image_prompt_embeds_list), dim=0).mean(dim=0)
|
1206 |
# FAILS TIMESTEPS clip_image_embeds = torch.cat(image_prompt_embeds_list, dim=0).mean(dim=0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1207 |
|
|
|
1208 |
# 1. Stack the image embeddings
|
1209 |
stacked_image_embeds = torch.cat(image_prompt_embeds_list, dim=1)
|
1210 |
print('shape 1: ', stacked_image_embeds.shape)
|
@@ -1214,18 +1223,20 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
|
|
1214 |
# 3. Create a tensor of zeros with the same shape as the averaged embedding
|
1215 |
zeros_tensor = torch.zeros_like(average_image_embed)
|
1216 |
#print('shape 3: ', zeros_tensor.shape)
|
1217 |
-
|
1218 |
-
print('shape 3.1: ',
|
1219 |
-
|
1220 |
-
print('shape 3.5: ',
|
1221 |
# 4. Concatenate the zeros and the average embedding
|
1222 |
-
|
1223 |
-
|
1224 |
-
|
1225 |
-
|
1226 |
-
|
1227 |
-
|
1228 |
-
|
|
|
|
|
1229 |
'''
|
1230 |
#clip_image_embeds = clip_image_embeds.unsqueeze(0) # Add a dimension at the beginning so now you have [1, 2*seq_len_img, embed_dim_img]
|
1231 |
print('shape 5: ', clip_image_embeds.shape)
|
|
|
1204 |
#clip_image_embeds = torch.cat([torch.zeros_like(torch.stack(image_prompt_embeds_list)), torch.stack(image_prompt_embeds_list)], dim=0).mean(dim=0)
|
1205 |
# FAILS clip_image_embeds = torch.cat(torch.stack(image_prompt_embeds_list), dim=0).mean(dim=0)
|
1206 |
# FAILS TIMESTEPS clip_image_embeds = torch.cat(image_prompt_embeds_list, dim=0).mean(dim=0)
|
1207 |
+
clip_image_embeds_cat_list = torch.cat(image_prompt_embeds_list).mean(dim=0)
|
1208 |
+
print('catted with mean shape 1: ', stacked_image_embeds.shape)
|
1209 |
+
clip_image_embeds_stack_list = torch.stack(image_prompt_embeds_list).mean(dim=0)
|
1210 |
+
print('stacked with mean shape 1: ', stacked_image_embeds.shape)
|
1211 |
+
clip_image_embeds_cat_list = torch.cat(image_prompt_embeds_list)
|
1212 |
+
print('catted without mean shape 1: ', stacked_image_embeds.shape)
|
1213 |
+
clip_image_embeds_stack_list = torch.stack(image_prompt_embeds_list)
|
1214 |
+
print('stacked without mean shape 1: ', stacked_image_embeds.shape)
|
1215 |
|
1216 |
+
|
1217 |
# 1. Stack the image embeddings
|
1218 |
stacked_image_embeds = torch.cat(image_prompt_embeds_list, dim=1)
|
1219 |
print('shape 1: ', stacked_image_embeds.shape)
|
|
|
1223 |
# 3. Create a tensor of zeros with the same shape as the averaged embedding
|
1224 |
zeros_tensor = torch.zeros_like(average_image_embed)
|
1225 |
#print('shape 3: ', zeros_tensor.shape)
|
1226 |
+
zeros_tensor_repeat = zeros_tensor.repeat(1, 1, 1)
|
1227 |
+
print('shape 3.1: ', zeros_tensor_repeat.shape)
|
1228 |
+
clip_image_embeds_repeat = average_image_embed.repeat(1, 1, 1)
|
1229 |
+
print('shape 3.5: ', clip_image_embeds_repeat.shape)
|
1230 |
# 4. Concatenate the zeros and the average embedding
|
1231 |
+
clip_image_embeds_cat = torch.cat([zeros_tensor, average_image_embed], dim=0)
|
1232 |
+
print('shape 4: ', clip_image_embeds_cat.shape)
|
1233 |
+
clip_image_embeds_cat_repeat = clip_image_embeds_cat.repeat(1, 1, 1)
|
1234 |
+
print('shape 4.1: ', clip_image_embeds_cat_repeat.shape)
|
1235 |
+
clip_image_embeds_repeat_cat = torch.cat([zeros_tensor_repeat, clip_image_embeds_repeat], dim=0)
|
1236 |
+
print('shape 4a: ', clip_image_embeds_repeat_cat.shape)
|
1237 |
+
clip_image_embeds_repeat_cat_1 = torch.cat([zeros_tensor_repeat, clip_image_embeds_repeat], dim=1)
|
1238 |
+
print('shape 4b: ', clip_image_embeds_repeat_cat_1.shape)
|
1239 |
+
clip_image_embeds = clip_image_embeds_repeat_cat
|
1240 |
'''
|
1241 |
#clip_image_embeds = clip_image_embeds.unsqueeze(0) # Add a dimension at the beginning so now you have [1, 2*seq_len_img, embed_dim_img]
|
1242 |
print('shape 5: ', clip_image_embeds.shape)
|