Spaces:

dbaranchuk
/

instruct-p2p-distill

Runtime error

App Files Community

dbaranchuk committed on Jun 30, 2024

Commit

8f3a280

verified ·

1 Parent(s): 3ad0e52

Delete p2p.py

Browse files

Files changed (1) hide show

p2p.py +0 -454

p2p.py DELETED Viewed

@@ -1,454 +0,0 @@
-import torch.nn.functional as nnf
-import torch
-import abc
-import numpy as np
-import seq_aligner
-from typing import Optional, Union, Tuple, List, Callable, Dict
-MAX_NUM_WORDS = 77
-LOW_RESOURCE = False
-NUM_DDIM_STEPS = 50
-device = 'cuda'
-tokenizer = None
-# Different attention controllers
-# ----------------------------------------------------------------------
-class LocalBlend:
-    def get_mask(self, maps, alpha, use_pool, x_t):
-        k = 1
-        maps = (maps * alpha).sum(-1).mean(1)
-        if use_pool:
-            maps = nnf.max_pool2d(maps, (k * 2 + 1, k * 2 + 1), (1, 1), padding=(k, k))
-        mask = nnf.interpolate(maps, size=(x_t.shape[2:]))
-        mask = mask / mask.max(2, keepdims=True)[0].max(3, keepdims=True)[0]
-        mask = mask.gt(self.th[1 - int(use_pool)])
-        mask = mask[:1] + mask
-        return mask
-    def __call__(self, x_t, attention_store):
-        self.counter += 1
-        if self.counter > self.start_blend:
-            maps = attention_store["down_cross"][2:4] + attention_store["up_cross"][:3]
-            maps = [item.reshape(self.alpha_layers.shape[0], -1, 1, 16, 16, MAX_NUM_WORDS) for item in maps]
-            maps = torch.cat(maps, dim=1)
-            mask = self.get_mask(maps, self.alpha_layers, True, x_t)
-            if self.substruct_layers is not None:
-                maps_sub = ~self.get_mask(maps, self.substruct_layers, False, x_t)
-                mask = mask * maps_sub
-            mask = mask.float()
-            x_t = x_t[:1] + mask * (x_t - x_t[:1])
-        return x_t
-    def __init__(self, prompts: List[str], words: [List[List[str]]], substruct_words=None, start_blend=0.2,
-                 th=(.3, .3)):
-        alpha_layers = torch.zeros(len(prompts), 1, 1, 1, 1, MAX_NUM_WORDS)
-        for i, (prompt, words_) in enumerate(zip(prompts, words)):
-            if type(words_) is str:
-                words_ = [words_]
-            for word in words_:
-                ind = get_word_inds(prompt, word, tokenizer)
-                alpha_layers[i, :, :, :, :, ind] = 1
-        if substruct_words is not None:
-            substruct_layers = torch.zeros(len(prompts), 1, 1, 1, 1, MAX_NUM_WORDS)
-            for i, (prompt, words_) in enumerate(zip(prompts, substruct_words)):
-                if type(words_) is str:
-                    words_ = [words_]
-                for word in words_:
-                    ind = get_word_inds(prompt, word, tokenizer)
-                    substruct_layers[i, :, :, :, :, ind] = 1
-            self.substruct_layers = substruct_layers.to(device)
-        else:
-            self.substruct_layers = None
-        self.alpha_layers = alpha_layers.to(device)
-        self.start_blend = int(start_blend * NUM_DDIM_STEPS)
-        self.counter = 0
-        self.th = th
-class EmptyControl:
-    def step_callback(self, x_t):
-        return x_t
-    def between_steps(self):
-        return
-    def __call__(self, attn, is_cross: bool, place_in_unet: str):
-        return attn
-class AttentionControl(abc.ABC):
-    def step_callback(self, x_t):
-        return x_t
-    def between_steps(self):
-        return
-    @property
-    def num_uncond_att_layers(self):
-        return self.num_att_layers if LOW_RESOURCE else 0
-    @abc.abstractmethod
-    def forward(self, attn, is_cross: bool, place_in_unet: str):
-        raise NotImplementedError
-    def __call__(self, attn, is_cross: bool, place_in_unet: str):
-        if self.cur_att_layer >= self.num_uncond_att_layers:
-            if LOW_RESOURCE:
-                attn = self.forward(attn, is_cross, place_in_unet)
-            else:
-                h = attn.shape[0]
-                attn[h // 2:] = self.forward(attn[h // 2:], is_cross, place_in_unet)
-        self.cur_att_layer += 1
-        if self.cur_att_layer == self.num_att_layers + self.num_uncond_att_layers:
-            self.cur_att_layer = 0
-            self.cur_step += 1
-            self.between_steps()
-        return attn
-    def reset(self):
-        self.cur_step = 0
-        self.cur_att_layer = 0
-    def __init__(self):
-        self.cur_step = 0
-        self.num_att_layers = -1
-        self.cur_att_layer = 0
-class SpatialReplace(EmptyControl):
-    def step_callback(self, x_t):
-        if self.cur_step < self.stop_inject:
-            b = x_t.shape[0]
-            x_t = x_t[:1].expand(b, *x_t.shape[1:])
-        return x_t
-    def __init__(self, stop_inject: float):
-        super(SpatialReplace, self).__init__()
-        self.stop_inject = int((1 - stop_inject) * NUM_DDIM_STEPS)
-class AttentionStore(AttentionControl):
-    @staticmethod
-    def get_empty_store():
-        return {"down_cross": [], "mid_cross": [], "up_cross": [],
-                "down_self": [], "mid_self": [], "up_self": []}
-    def forward(self, attn, is_cross: bool, place_in_unet: str):
-        key = f"{place_in_unet}_{'cross' if is_cross else 'self'}"
-        if attn.shape[1] <= 32 ** 2:  # avoid memory overhead
-            self.step_store[key].append(attn)
-        return attn
-    def between_steps(self):
-        if len(self.attention_store) == 0:
-            self.attention_store = self.step_store
-        else:
-            for key in self.attention_store:
-                for i in range(len(self.attention_store[key])):
-                    self.attention_store[key][i] += self.step_store[key][i]
-        self.step_store = self.get_empty_store()
-    def get_average_attention(self):
-        average_attention = {key: [item / self.cur_step for item in self.attention_store[key]] for key in
-                             self.attention_store}
-        return average_attention
-    def reset(self):
-        super(AttentionStore, self).reset()
-        self.step_store = self.get_empty_store()
-        self.attention_store = {}
-    def __init__(self):
-        super(AttentionStore, self).__init__()
-        self.step_store = self.get_empty_store()
-        self.attention_store = {}
-class AttentionControlEdit(AttentionStore, abc.ABC):
-    def step_callback(self, x_t):
-        if self.local_blend is not None:
-            x_t = self.local_blend(x_t, self.attention_store)
-        return x_t
-    def replace_self_attention(self, attn_base, att_replace, place_in_unet):
-        if att_replace.shape[2] <= 32 ** 2:
-            attn_base = attn_base.unsqueeze(0).expand(att_replace.shape[0], *attn_base.shape)
-            return attn_base
-        else:
-            return att_replace
-    @abc.abstractmethod
-    def replace_cross_attention(self, attn_base, att_replace):
-        raise NotImplementedError
-    def forward(self, attn, is_cross: bool, place_in_unet: str):
-        super(AttentionControlEdit, self).forward(attn, is_cross, place_in_unet)
-        if is_cross or (self.num_self_replace[0] <= self.cur_step < self.num_self_replace[1]):
-            h = attn.shape[0] // (self.batch_size)
-            attn = attn.reshape(self.batch_size, h, *attn.shape[1:])
-            attn_base, attn_repalce = attn[0], attn[1:]
-            if is_cross:
-                alpha_words = self.cross_replace_alpha[self.cur_step]
-                attn_repalce_new = self.replace_cross_attention(attn_base, attn_repalce) * alpha_words + (
-                            1 - alpha_words) * attn_repalce
-                attn[1:] = attn_repalce_new
-            else:
-                attn[1:] = self.replace_self_attention(attn_base, attn_repalce, place_in_unet)
-            attn = attn.reshape(self.batch_size * h, *attn.shape[2:])
-        return attn
-    def __init__(self, prompts, num_steps: int,
-                 cross_replace_steps: Union[float, Tuple[float, float], Dict[str, Tuple[float, float]]],
-                 self_replace_steps: Union[float, Tuple[float, float]],
-                 local_blend: Optional[LocalBlend]):
-        super(AttentionControlEdit, self).__init__()
-        self.batch_size = len(prompts)
-        self.cross_replace_alpha = get_time_words_attention_alpha(prompts, num_steps, cross_replace_steps,
-                                                                  tokenizer).to(device)
-        if type(self_replace_steps) is float:
-            self_replace_steps = 0, self_replace_steps
-        self.num_self_replace = int(num_steps * self_replace_steps[0]), int(num_steps * self_replace_steps[1])
-        self.local_blend = local_blend
-class AttentionReplace(AttentionControlEdit):
-    def replace_cross_attention(self, attn_base, att_replace):
-        return torch.einsum('hpw,bwn->bhpn', attn_base, self.mapper)
-    def __init__(self, prompts, num_steps: int, cross_replace_steps: float, self_replace_steps: float,
-                 local_blend: Optional[LocalBlend] = None):
-        super(AttentionReplace, self).__init__(prompts, num_steps, cross_replace_steps, self_replace_steps, local_blend)
-        self.mapper = seq_aligner.get_replacement_mapper(prompts, tokenizer).to(device)
-class AttentionRefine(AttentionControlEdit):
-    def replace_cross_attention(self, attn_base, att_replace):
-        attn_base_replace = attn_base[:, :, self.mapper].permute(2, 0, 1, 3)
-        attn_replace = attn_base_replace * self.alphas + att_replace * (1 - self.alphas)
-        # attn_replace = attn_replace / attn_replace.sum(-1, keepdims=True)
-        return attn_replace
-    def __init__(self, prompts, num_steps: int, cross_replace_steps: float, self_replace_steps: float,
-                 local_blend: Optional[LocalBlend] = None):
-        super(AttentionRefine, self).__init__(prompts, num_steps, cross_replace_steps, self_replace_steps, local_blend)
-        self.mapper, alphas = seq_aligner.get_refinement_mapper(prompts, tokenizer)
-        self.mapper, alphas = self.mapper.to(device), alphas.to(device)
-        self.alphas = alphas.reshape(alphas.shape[0], 1, 1, alphas.shape[1])
-class AttentionReweight(AttentionControlEdit):
-    def replace_cross_attention(self, attn_base, att_replace):
-        if self.prev_controller is not None:
-            attn_base = self.prev_controller.replace_cross_attention(attn_base, att_replace)
-        attn_replace = attn_base[None, :, :, :] * self.equalizer[:, None, None, :]
-        # attn_replace = attn_replace / attn_replace.sum(-1, keepdims=True)
-        return attn_replace
-    def __init__(self, prompts, num_steps: int, cross_replace_steps: float, self_replace_steps: float, equalizer,
-                 local_blend: Optional[LocalBlend] = None, controller: Optional[AttentionControlEdit] = None):
-        super(AttentionReweight, self).__init__(prompts, num_steps, cross_replace_steps, self_replace_steps,
-                                                local_blend)
-        self.equalizer = equalizer.to(device)
-        self.prev_controller = controller
-        self.attn = []
-# ----------------------------------------------------------------------
-# Attention controller during sampling
-# ----------------------------------------------------------------------
-def make_controller(prompts: List[str], is_replace_controller: bool, cross_replace_steps: Dict[str, float],
-                    self_replace_steps: float, blend_words=None, equilizer_params=None) -> AttentionControlEdit:
-    if blend_words is None:
-        lb = None
-    else:
-        lb = LocalBlend(prompts, blend_words, start_blend=0.0, th=(0.3, 0.3))
-    if is_replace_controller:
-        controller = AttentionReplace(prompts, NUM_DDIM_STEPS, cross_replace_steps=cross_replace_steps,
-                                      self_replace_steps=self_replace_steps, local_blend=lb)
-    else:
-        controller = AttentionRefine(prompts, NUM_DDIM_STEPS, cross_replace_steps=cross_replace_steps,
-                                     self_replace_steps=self_replace_steps, local_blend=lb)
-    if equilizer_params is not None:
-        eq = get_equalizer(prompts[1], equilizer_params["words"], equilizer_params["values"])
-        controller = AttentionReweight(prompts, NUM_DDIM_STEPS, cross_replace_steps=cross_replace_steps,
-                                       self_replace_steps=self_replace_steps, equalizer=eq, local_blend=lb,
-                                       controller=controller)
-    return controller
-def register_attention_control(model, controller):
-    def ca_forward(self, place_in_unet):
-        to_out = self.to_out
-        if type(to_out) is torch.nn.modules.container.ModuleList:
-            to_out = self.to_out[0]
-        else:
-            to_out = self.to_out
-        def forward(hidden_states, encoder_hidden_states=None, attention_mask=None, temb=None, ):
-            is_cross = encoder_hidden_states is not None
-            residual = hidden_states
-            if self.spatial_norm is not None:
-                hidden_states = self.spatial_norm(hidden_states, temb)
-            input_ndim = hidden_states.ndim
-            if input_ndim == 4:
-                batch_size, channel, height, width = hidden_states.shape
-                hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-            batch_size, sequence_length, _ = (
-                hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-            )
-            attention_mask = self.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-            if self.group_norm is not None:
-                hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
-            query = self.to_q(hidden_states)
-            if encoder_hidden_states is None:
-                encoder_hidden_states = hidden_states
-            elif self.norm_cross:
-                encoder_hidden_states = self.norm_encoder_hidden_states(encoder_hidden_states)
-            key = self.to_k(encoder_hidden_states)
-            value = self.to_v(encoder_hidden_states)
-            query = self.head_to_batch_dim(query)
-            key = self.head_to_batch_dim(key)
-            value = self.head_to_batch_dim(value)
-            attention_probs = self.get_attention_scores(query, key, attention_mask)
-            attention_probs = controller(attention_probs, is_cross, place_in_unet)
-            hidden_states = torch.bmm(attention_probs, value)
-            hidden_states = self.batch_to_head_dim(hidden_states)
-            # linear proj
-            hidden_states = to_out(hidden_states)
-            if input_ndim == 4:
-                hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-            if self.residual_connection:
-                hidden_states = hidden_states + residual
-            hidden_states = hidden_states / self.rescale_output_factor
-            return hidden_states
-        return forward
-    class DummyController:
-        def __call__(self, *args):
-            return args[0]
-        def __init__(self):
-            self.num_att_layers = 0
-    if controller is None:
-        controller = DummyController()
-    def register_recr(net_, count, place_in_unet):
-        if net_.__class__.__name__ == 'Attention':
-            net_.forward = ca_forward(net_, place_in_unet)
-            return count + 1
-        elif hasattr(net_, 'children'):
-            for net__ in net_.children():
-                count = register_recr(net__, count, place_in_unet)
-        return count
-    cross_att_count = 0
-    sub_nets = model.unet.named_children()
-    for net in sub_nets:
-        if "down" in net[0]:
-            cross_att_count += register_recr(net[1], 0, "down")
-        elif "up" in net[0]:
-            cross_att_count += register_recr(net[1], 0, "up")
-        elif "mid" in net[0]:
-            cross_att_count += register_recr(net[1], 0, "mid")
-    controller.num_att_layers = cross_att_count
-# ----------------------------------------------------------------------
-# Other
-# ----------------------------------------------------------------------
-def get_equalizer(text: str, word_select: Union[int, Tuple[int, ...]], values: Union[List[float],
-                                                                                     Tuple[float, ...]]):
-    if type(word_select) is int or type(word_select) is str:
-        word_select = (word_select,)
-    equalizer = torch.ones(1, 77)
-    for word, val in zip(word_select, values):
-        inds = get_word_inds(text, word, tokenizer)
-        equalizer[:, inds] = val
-    return equalizer
-def get_time_words_attention_alpha(prompts, num_steps,
-                                   cross_replace_steps: Union[float, Dict[str, Tuple[float, float]]],
-                                   tokenizer, max_num_words=77):
-    if type(cross_replace_steps) is not dict:
-        cross_replace_steps = {"default_": cross_replace_steps}
-    if "default_" not in cross_replace_steps:
-        cross_replace_steps["default_"] = (0., 1.)
-    alpha_time_words = torch.zeros(num_steps + 1, len(prompts) - 1, max_num_words)
-    for i in range(len(prompts) - 1):
-        alpha_time_words = update_alpha_time_word(alpha_time_words, cross_replace_steps["default_"],
-                                                  i)
-    for key, item in cross_replace_steps.items():
-        if key != "default_":
-            inds = [get_word_inds(prompts[i], key, tokenizer) for i in range(1, len(prompts))]
-            for i, ind in enumerate(inds):
-                if len(ind) > 0:
-                    alpha_time_words = update_alpha_time_word(alpha_time_words, item, i, ind)
-    alpha_time_words = alpha_time_words.reshape(num_steps + 1, len(prompts) - 1, 1, 1, max_num_words)
-    return alpha_time_words
-def get_word_inds(text: str, word_place: int, tokenizer):
-    split_text = text.split(" ")
-    if type(word_place) is str:
-        word_place = [i for i, word in enumerate(split_text) if word_place == word]
-    elif type(word_place) is int:
-        word_place = [word_place]
-    out = []
-    if len(word_place) > 0:
-        words_encode = [tokenizer.decode([item]).strip("#") for item in tokenizer.encode(text)][1:-1]
-        cur_len, ptr = 0, 0
-        for i in range(len(words_encode)):
-            cur_len += len(words_encode[i])
-            if ptr in word_place:
-                out.append(i + 1)
-            if cur_len >= len(split_text[ptr]):
-                ptr += 1
-                cur_len = 0
-    return np.array(out)
-def update_alpha_time_word(alpha, bounds: Union[float, Tuple[float, float]], prompt_ind: int,
-                           word_inds: Optional[torch.Tensor] = None):
-    if type(bounds) is float:
-        bounds = 0, bounds
-    start, end = int(bounds[0] * alpha.shape[0]), int(bounds[1] * alpha.shape[0])
-    if word_inds is None:
-        word_inds = torch.arange(alpha.shape[2])
-    alpha[: start, prompt_ind, word_inds] = 0
-    alpha[start: end, prompt_ind, word_inds] = 1
-    alpha[end:, prompt_ind, word_inds] = 0
-    return alpha
-# ----------------------------------------------------------------------