# Patched LiteMLA attention for ZeroGPU
# Copyright 2024 MIT Han Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0

import os
from typing import Optional, Tuple

import torch
from torch import nn
from torch.nn import functional as F


class LiteMLA(nn.Module):
    r"""Lightweight multiscale linear attention."""

    PAD_VAL = 1

    def __init__(
        self,
        in_dim: int,
        out_dim: int,
        heads: Optional[int] = None,
        heads_ratio: float = 1.0,
        dim: int = 32,
        kernel_func: str = "relu",
        scales: Optional[Tuple[int, ...]] = (5,),
        eps: float = 1e-15,
        use_bias: bool = False,
        norm=(None, "bn2d"),
        act=(None, None),
    ):
        heads = heads or int(out_dim // dim * heads_ratio)
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.heads = heads
        self.dim = dim
        self.scales = scales
        self.eps = eps
        # The multi-scale aggregation branch is disabled in this ZeroGPU patch,
        # so only the base single-scale linear-attention path is built.
        self.aggreg = None
        scales = ()
        self.kernel_func = nn.ReLU(inplace=False)
        self.qkv = nn.Linear(in_dim, in_dim * 3, bias=use_bias)
        self.proj = nn.Linear(out_dim, out_dim)

    @torch.cuda.amp.autocast(enabled=os.environ.get("AUTOCAST_LINEAR_ATTN", "false") == "true")
    def attn_matmul(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
        # Lightweight linear attention: apply the ReLU kernel to q and k,
        # then compute (V @ K) @ Q instead of softmax attention.
        q = self.kernel_func(q)  # B, h, h_d, N
        k = self.kernel_func(k)
        use_fp32_attention = getattr(self, "fp32_attention", False)  # necessary to avoid NaN loss
        if use_fp32_attention:
            q, k, v = q.float(), k.float(), v.float()
        # Append a constant row to v so the same matmuls also accumulate the
        # normalizer; the last row of `out` then holds (sum_j k_j) . q_i.
        v = F.pad(v, (0, 0, 0, 1), mode="constant", value=LiteMLA.PAD_VAL)
        vk = torch.matmul(v, k)
        out = torch.matmul(vk, q)
        if out.dtype in [torch.float16, torch.bfloat16]:
            out = out.float()
        out = out[:, :, :-1] / (out[:, :, -1:] + self.eps)
        return out

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, C).permute(0, 2, 3, 1)
        # B, 3, C, N --> three tensors of shape B, C, N
        q, k, v = qkv.unbind(1)
        dtype = q.dtype
        q = q.reshape(B, C // self.dim, self.dim, N)  # B, h, h_d, N
        k = k.reshape(B, C // self.dim, self.dim, N).transpose(-1, -2)  # B, h, N, h_d
        v = v.reshape(B, C // self.dim, self.dim, N)  # B, h, h_d, N
        out = self.attn_matmul(q, k, v).to(dtype)
        out = out.view(B, C, N).permute(0, 2, 1)  # B, N, C
        out = self.proj(out)
        return out

    @property
    def module_str(self) -> str:
        _str = type(self).__name__ + "("
        eps = f"{self.eps:.1E}"
        _str += f"i={self.in_dim},o={self.out_dim},h={self.heads},d={self.dim},eps={eps}"
        return _str

    def __repr__(self):
        return f"EPS{self.eps}-" + super().__repr__()