bardofcodes
/

pattern_analogies

@@ -33,9 +33,337 @@ from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineO
 from diffusers.configuration_utils import ConfigMixin, register_to_config
 # REf: https://github.com/tatp22/multidim-positional-encoding/tree/master
-from analogy_encoder import AnalogyEncoder
-from analogy_projector import AnalogyProjector
-from analogy_input_processor import AnalogyInputProcessor
 class PatternAnalogyTrifuser(DiffusionPipeline):
     r"""

 from diffusers.configuration_utils import ConfigMixin, register_to_config
 # REf: https://github.com/tatp22/multidim-positional-encoding/tree/master
+# Output feature width: default out_size of the mixer modules and the channel
+# count handed to the positional encodings in AnalogyProjector.
+OUT_SIZE = 768
+# Default combined input width of HighLowMixer (lower and upper halves together).
+IN_SIZE = 2048
+# Target side length passed to Resize, and the per-channel mean/std passed to
+# Normalize, in the DINO preprocessing transform.
+DINO_SIZE = 224
+DINO_MEAN = [0.485, 0.456, 0.406]
+DINO_STD = [0.229, 0.224, 0.225]
+# Same three settings for the SigLIP transform. Mean/std hold a single value,
+# which AnalogyInputProcessor replicates to three channels for its buffers.
+SIGLIP_SIZE = 256
+SIGLIP_MEAN = [0.5]
+SIGLIP_STD = [0.5]
+def get_emb(sin_inp):
+    """
+    Gets a base embedding for one dimension with sin and cos intertwined
+    """
+    emb = th.stack((sin_inp.sin(), sin_inp.cos()), dim=-1)
+    return th.flatten(emb, -2, -1)
+class PositionalEncoding1D(nn.Module):
+    def __init__(self, channels):
+        """
+        :param channels: The last dimension of the tensor you want to apply pos emb to.
+        """
+        super(PositionalEncoding1D, self).__init__()
+        self.org_channels = channels
+        channels = int(np.ceil(channels / 2) * 2)
+        self.channels = channels
+        inv_freq = 1.0 / (10000 ** (th.arange(0, channels, 2).float() / channels))
+        self.register_buffer("inv_freq", inv_freq)
+        self.register_buffer("cached_penc", None, persistent=False)
+    def forward(self, tensor):
+        """
+        :param tensor: A 3d tensor of size (batch_size, x, ch)
+        :return: Positional Encoding Matrix of size (batch_size, x, ch)
+        """
+        if len(tensor.shape) != 3:
+            raise RuntimeError("The input tensor has to be 3d!")
+        if self.cached_penc is not None and self.cached_penc.shape == tensor.shape:
+            return self.cached_penc
+        self.cached_penc = None
+        batch_size, x, orig_ch = tensor.shape
+        pos_x = th.arange(x, device=tensor.device, dtype=self.inv_freq.dtype)
+        sin_inp_x = th.einsum("i,j->ij", pos_x, self.inv_freq)
+        emb_x = get_emb(sin_inp_x)
+        emb = th.zeros((x, self.channels), device=tensor.device, dtype=tensor.dtype)
+        emb[:, : self.channels] = emb_x
+        self.cached_penc = emb[None, :, :orig_ch].repeat(batch_size, 1, 1)
+        return self.cached_penc
+class PositionalEncoding3D(nn.Module):
+    def __init__(self, channels):
+        """
+        :param channels: The last dimension of the tensor you want to apply pos emb to.
+        """
+        super(PositionalEncoding3D, self).__init__()
+        self.org_channels = channels
+        channels = int(np.ceil(channels / 6) * 2)
+        if channels % 2:
+            channels += 1
+        self.channels = channels
+        inv_freq = 1.0 / (10000 ** (th.arange(0, channels, 2).float() / channels))
+        self.register_buffer("inv_freq", inv_freq)
+        self.register_buffer("cached_penc", None, persistent=False)
+    def forward(self, tensor):
+        """
+        :param tensor: A 5d tensor of size (batch_size, x, y, z, ch)
+        :return: Positional Encoding Matrix of size (batch_size, x, y, z, ch)
+        """
+        if len(tensor.shape) != 5:
+            raise RuntimeError("The input tensor has to be 5d!")
+        if self.cached_penc is not None and self.cached_penc.shape == tensor.shape:
+            return self.cached_penc
+        self.cached_penc = None
+        batch_size, x, y, z, orig_ch = tensor.shape
+        pos_x = th.arange(x, device=tensor.device, dtype=self.inv_freq.dtype)
+        pos_y = th.arange(y, device=tensor.device, dtype=self.inv_freq.dtype)
+        pos_z = th.arange(z, device=tensor.device, dtype=self.inv_freq.dtype)
+        sin_inp_x = th.einsum("i,j->ij", pos_x, self.inv_freq)
+        sin_inp_y = th.einsum("i,j->ij", pos_y, self.inv_freq)
+        sin_inp_z = th.einsum("i,j->ij", pos_z, self.inv_freq)
+        emb_x = get_emb(sin_inp_x).unsqueeze(1).unsqueeze(1)
+        emb_y = get_emb(sin_inp_y).unsqueeze(1)
+        emb_z = get_emb(sin_inp_z)
+        emb = th.zeros(
+            (x, y, z, self.channels * 3),
+            device=tensor.device,
+            dtype=tensor.dtype,
+        )
+        emb[:, :, :, : self.channels] = emb_x
+        emb[:, :, :, self.channels : 2 * self.channels] = emb_y
+        emb[:, :, :, 2 * self.channels :] = emb_z
+        self.cached_penc = emb[None, :, :, :, :orig_ch].repeat(batch_size, 1, 1, 1, 1)
+        return self.cached_penc
+class AnalogyInputProcessor(ModelMixin, ConfigMixin):
+    @register_to_config
+    def __init__(self,):
+        super(AnalogyInputProcessor, self).__init__()
+        self.dino_transform = transforms.Compose(
+            [
+                transforms.Resize((DINO_SIZE, DINO_SIZE)),
+                transforms.ToTensor(),
+                transforms.Normalize(DINO_MEAN, DINO_STD), # SIGLIP normalization
+            ]
+        )
+        self.siglip_transform = transforms.Compose(
+            [
+                transforms.Resize((SIGLIP_SIZE, SIGLIP_SIZE)),
+                transforms.ToTensor(),
+                transforms.Normalize(SIGLIP_MEAN, SIGLIP_STD), # SIGLIP normalization
+            ]
+        )
+        dino_mean = th.tensor(DINO_MEAN).view(1, 3, 1, 1)
+        dino_std = th.tensor(DINO_STD).view(1, 3, 1, 1)
+        siglip_mean = [SIGLIP_MEAN[0],] * 3
+        siglip_std = [SIGLIP_STD[0],] * 3
+        siglip_mean = th.tensor(siglip_mean).view(1, 3, 1, 1)
+        siglip_std = th.tensor(siglip_std).view(1, 3, 1, 1)
+        self.register_buffer("dino_mean", dino_mean)
+        self.register_buffer("dino_std", dino_std)
+        self.register_buffer("siglip_mean", siglip_mean)
+        self.register_buffer("siglip_std", siglip_std)
+    def __call__(self, analogy_prompt):
+        # List of tuples of (A, A*, B)
+        img_a_dino = []
+        img_a_siglip = []
+        img_a_star_dino = []
+        img_a_star_siglip = []
+        img_b_dino = []
+        img_b_siglip = []
+        for im_set in analogy_prompt:
+            img_a, img_a_star, img_b = im_set
+            img_a_dino.append(self.dino_transform(img_a))
+            img_a_siglip.append(self.siglip_transform(img_a))
+            img_a_star_dino.append(self.dino_transform(img_a_star))
+            img_a_star_siglip.append(self.siglip_transform(img_a_star))
+            img_b_dino.append(self.dino_transform(img_b))
+            img_b_siglip.append(self.siglip_transform(img_b))
+        img_a_dino = th.stack(img_a_dino, 0)
+        img_a_siglip = th.stack(img_a_siglip, 0)
+        img_a_star_dino = th.stack(img_a_star_dino, 0)
+        img_a_star_siglip = th.stack(img_a_star_siglip, 0)
+        img_b_dino = th.stack(img_b_dino, 0)
+        img_b_siglip = th.stack(img_b_siglip, 0)
+        dino_combined_input = th.stack([img_b_dino, img_a_dino, img_a_star_dino], 0)
+        siglip_combined_input = th.stack([img_b_siglip, img_a_siglip, img_a_star_siglip], 0)
+        return dino_combined_input, siglip_combined_input
+    def get_negative(self, dino_in, siglip_in):
+        dino_i = ((dino_in * 0 + 0.5) - self.dino_mean) / self.dino_std
+        siglip_i = ((siglip_in * 0 + 0.5) - self.siglip_mean) / self.siglip_std
+        return dino_i, siglip_i
+class AnalogyProjector(ModelMixin, ConfigMixin):
+    @register_to_config
+    def __init__(self):
+        super(AnalogyProjector, self).__init__()
+        self.projector = DinoSiglipMixer()
+        self.pos_embd_1D = PositionalEncoding1D(OUT_SIZE)
+        self.pos_embd_3D = PositionalEncoding3D(OUT_SIZE)
+    def forward(self, dino_in, siglip_in, batch_size):
+        image_embeddings = self.projector(dino_in, siglip_in)
+        image_embeddings = einops.rearrange(image_embeddings, '(k b) t d -> b k t d', b=batch_size)
+        image_embeddings = self.position_embd(image_embeddings)
+        return image_embeddings
+    def position_embd(self, image_embeddings, concat=False):
+        canvas_embd = image_embeddings[:, :, 1:, :]
+        batch_size = canvas_embd.shape[0]
+        type_size = canvas_embd.shape[1]
+        xy_size = canvas_embd.shape[2]
+        x_size = int(xy_size ** 0.5)
+        canvas_embd = canvas_embd.reshape(batch_size, type_size, x_size, x_size, -1)
+        if concat:
+            canvas_embd = th.cat([canvas_embd, self.pos_embd_3D(canvas_embd)], -1)
+        else:
+            canvas_embd = self.pos_embd_3D(canvas_embd) + canvas_embd
+        canvas_embd = canvas_embd.reshape(batch_size, type_size, xy_size, -1)
+        class_embd = image_embeddings[:, :, 0, :]
+        if concat:
+            class_embd = th.cat([class_embd, self.pos_embd_1D(class_embd)], -1)
+        else:
+            class_embd = self.pos_embd_1D(class_embd) + class_embd
+        all_embd_list = []
+        for i in range(type_size):
+            all_embd_list.append(class_embd[:, i:i+1])
+            all_embd_list.append(canvas_embd[:, i])
+        image_embeddings = th.cat(all_embd_list, 1)
+        return image_embeddings
+class HighLowMixer(th.nn.Module):
+    def __init__(self, in_size=IN_SIZE, out_size=OUT_SIZE):
+        super().__init__()
+        mid_size = (in_size + out_size) // 2
+        self.lower_projector = th.nn.Sequential(
+            th.nn.LayerNorm(IN_SIZE//2),
+            th.nn.SiLU()
+        )
+        self.upper_projector = th.nn.Sequential(
+            th.nn.LayerNorm(IN_SIZE//2),
+            th.nn.SiLU()
+        )
+        self.projectors = th.nn.ModuleList([
+            # add layer norm
+            th.nn.Linear(in_size, mid_size),
+            th.nn.SiLU(),
+            th.nn.Linear(mid_size, out_size)
+        ])
+        # initialize
+        for proj in self.projectors:
+            if isinstance(proj, th.nn.Linear):
+                th.nn.init.xavier_uniform_(proj.weight)
+                th.nn.init.zeros_(proj.bias)
+    def forward(self, lower_in, upper_in, ):
+        # ALso format lower_in
+        lower_in = self.lower_projector(lower_in)
+        upper_in = self.upper_projector(upper_in)
+        x = th.cat([lower_in, upper_in], -1)
+        for proj in self.projectors:
+            x = proj(x)
+        return x
+class DinoSiglipMixer(th.nn.Module):
+    def __init__(self, in_size=OUT_SIZE * 2, out_size=OUT_SIZE):
+        super().__init__()
+        self.dino_projector = HighLowMixer()
+        self.siglip_projector = HighLowMixer()
+        self.projectors = th.nn.Sequential(
+            th.nn.SiLU(),
+            th.nn.Linear(in_size, out_size),
+        )
+        # initialize
+        for proj in self.projectors:
+            if isinstance(proj, th.nn.Linear):
+                th.nn.init.xavier_uniform_(proj.weight)
+                th.nn.init.zeros_(proj.bias)
+    def forward(self, dino_in, siglip_in):
+        # ALso format lower_in
+        lower, upper = th.chunk(dino_in, 2, -1)
+        dino_out = self.dino_projector(lower, upper)
+        lower, upper = th.chunk(siglip_in, 2, -1)
+        siglip_out = self.siglip_projector(lower, upper)
+        x = th.cat([dino_out, siglip_out], -1)
+        for proj in self.projectors:
+            x = proj(x)
+        return x
+class AnalogyEncoder(ModelMixin, ConfigMixin):
+    @register_to_config
+    def __init__(self, load_pretrained=False,
+                 dino_config_dict=None, siglip_config_dict=None):
+        super().__init__()
+        if load_pretrained:
+            image_encoder_dino = AutoModel.from_pretrained('facebook/dinov2-large', torch_dtype=th.float16)
+            image_encoder_siglip = SiglipVisionModel.from_pretrained("google/siglip-large-patch16-256", torch_dtype=th.float16, attn_implementation="sdpa")
+        else:
+            image_encoder_dino = AutoModel.from_config(Dinov2Config.from_dict(dino_config_dict))
+            image_encoder_siglip = AutoModel.from_config(SiglipVisionConfig.from_dict(siglip_config_dict))
+        image_encoder_dino.requires_grad_(False)
+        image_encoder_dino = image_encoder_dino.to(memory_format=th.channels_last)
+        image_encoder_siglip.requires_grad_(False)
+        image_encoder_siglip = image_encoder_siglip.to(memory_format=th.channels_last)
+        self.image_encoder_dino = image_encoder_dino
+        self.image_encoder_siglip = image_encoder_siglip
+    def dino_normalization(self, encoder_output):
+        embeds = encoder_output.last_hidden_state
+        embeds_pooled = embeds[:, 0:1]
+        embeds = embeds / th.norm(embeds_pooled, dim=-1, keepdim=True)
+        return embeds
+    def siglip_normalization(self, encoder_output):
+        embeds = th.cat ([encoder_output.pooler_output[:, None, :], encoder_output.last_hidden_state], dim=1)
+        embeds_pooled = embeds[:, 0:1]
+        embeds = embeds / th.norm(embeds_pooled, dim=-1, keepdim=True)
+        return embeds
+    def forward(self, dino_in, siglip_in):
+        x_1 = self.image_encoder_dino(dino_in, output_hidden_states=True)
+        x_1_first = x_1.hidden_states[0]
+        x_1 = self.dino_normalization(x_1)
+        x_2 = self.image_encoder_siglip(siglip_in, output_hidden_states=True)
+        x_2_first = x_2.hidden_states[0]
+        x_2_first_pool = th.mean(x_2_first, dim=1, keepdim=True)
+        x_2_first = th.cat([x_2_first_pool, x_2_first], 1)
+        x_2 = self.siglip_normalization(x_2)
+        dino_embd = th.cat([x_1, x_1_first], -1)
+        siglip_embd = th.cat([x_2, x_2_first], -1)
+        return dino_embd, siglip_embd
 class PatternAnalogyTrifuser(DiffusionPipeline):
     r"""