Update pipeline_stable_diffusion_3_ipa.py
pipeline_stable_diffusion_3_ipa.py
CHANGED
@@ -1215,24 +1215,28 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
         print('shape 2: ', average_image_embed.shape)
         average_image_embedf = torch.mean(stacked_image_embeds, dim=1).unsqueeze(0) # Add batch dimension after averaging
         print('shape 2a: ', average_image_embedf.shape)
-
-        print('shape 2b: ', average_image_embede.shape)
+
         # 3. Create a tensor of zeros with the same shape as the averaged embedding
         zeros_tensor = torch.zeros_like(average_image_embed)
         print('shape 3: ', zeros_tensor.shape)
+        zeros_tensor = torch.zeros_like(average_image_embed)
+        zeros_tensora = average_image_embed.repeat(1, 1, 1)
+        print('shape 3.1: ', clip_image_embedsa.shape)
+        clip_image_embedsa = average_image_embed.repeat(1, 1, 1)
+        print('shape 3.5: ', clip_image_embedsa.shape)
+        clip_image_embedse = torch.cat([zeros_tensora, average_image_embeda], dim=0)
+        print('shape 3.8: ', clip_image_embedse.shape)
         # 4. Concatenate the zeros and the average embedding
         clip_image_embeds = torch.cat([zeros_tensor, average_image_embed], dim=0)
-        clip_image_embeds2 = torch.cat([zeros_tensor, average_image_embede], dim=0)
         print('shape 4: ', clip_image_embeds.shape)
         print('shape 4a: ', clip_image_embeds2.shape)
+        clip_image_embeds2 = torch.cat([zeros_tensor, average_image_embede], dim=0)
         '''
         #clip_image_embeds = clip_image_embeds.unsqueeze(0) # Add a dimension at the beginning so now you have [1, 2*seq_len_img, embed_dim_img]
         print('shape 5: ', clip_image_embeds.shape)
 
         bs_embed, seq_len, _ = clip_image_embeds.shape
-
-        print('shape 6: ', clip_image_embeds.shape)
-
+
         clip_image_embedsa = clip_image_embeds.view(bs_embed, 1, -1)
         print('shape 7: ', clip_image_embedsa.shape)
         clip_image_embedsb = clip_image_embeds.view(seq_len, -1)
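Note on the added lines: as committed, new line 1224 prints `clip_image_embedsa.shape` one line before `clip_image_embedsa` is first assigned, and `average_image_embeda` (new line 1227) and `average_image_embede` (new line 1233) are not defined anywhere in this hunk. These debug statements would raise a NameError if the block is live code rather than part of the quoted-out section closed by the `'''` at new line 1234.

For context, the block being debugged exercises a common IP-Adapter pattern: average the CLIP image embeddings of several reference images, then pair the average with a same-shaped zeros tensor as the unconditional branch for classifier-free guidance. A minimal self-contained sketch of that pattern (the shapes 257 and 1280 are placeholders; the real values come from the image encoder):

import torch

# Hypothetical inputs: three reference images, each already encoded to a
# [seq_len_img, embed_dim_img] CLIP image embedding.
image_embeds = [torch.randn(257, 1280) for _ in range(3)]

# Stack to [num_images, seq_len_img, embed_dim_img], average across the
# images, and keep a leading batch dimension of 1.
stacked_image_embeds = torch.stack(image_embeds, dim=0)
average_image_embed = torch.mean(stacked_image_embeds, dim=0, keepdim=True)

# A zeros tensor of the same shape serves as the unconditional ("negative")
# image embedding for classifier-free guidance.
zeros_tensor = torch.zeros_like(average_image_embed)

# Concatenate along the batch dimension: index 0 is the unconditional
# branch, index 1 the conditional branch.
clip_image_embeds = torch.cat([zeros_tensor, average_image_embed], dim=0)
print(clip_image_embeds.shape)  # torch.Size([2, 257, 1280])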
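The `view()` experiments at the end of the hunk resemble the stock diffusers idiom for duplicating image embeddings once per generated image. Continuing from the sketch above, and with `num_images_per_prompt` as a hypothetical caller-supplied value:

num_images_per_prompt = 2  # hypothetical value for illustration

bs_embed, seq_len, _ = clip_image_embeds.shape
# Repeat each embedding along the sequence axis, then fold the repeats back
# into the batch dimension, giving one copy per generated image.
image_embeds = clip_image_embeds.repeat(1, num_images_per_prompt, 1)
image_embeds = image_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
print(image_embeds.shape)  # torch.Size([4, 257, 1280])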