# Adapted from https://github.com/HazyResearch/fly/tree/master/src/models/layers

import numpy as np
import torch
from torch.nn import functional as F

from einops import rearrange


def blockdiag_weight_to_dense_weight(weight):
    """
    Arguments:
        weight: (nblocks, out / nblocks, in / nblocks)
    Return:
        dense_weight: (out, in)
    """
    return torch.block_diag(*torch.unbind(weight, dim=0))
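

# Illustrative usage sketch, not part of the original module; the helper name and the
# example shape values (4 blocks of size 3 x 5) are assumptions for demonstration.
def _example_dense_weight():
    weight = torch.randn(4, 3, 5)                     # (nblocks, out / nblocks, in / nblocks)
    dense = blockdiag_weight_to_dense_weight(weight)  # (12, 20): zeros off the block diagonal
    assert dense.shape == (4 * 3, 4 * 5)
    # The i-th diagonal block of the dense matrix is exactly weight[i].
    assert torch.equal(dense[:3, :5], weight[0])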


def blockdiag_multiply_reference(x, weight):
    """
    This implementation is slow but more likely to be correct.
    Arguments:
        x: (..., n)
        weight: (nblocks, q, n / nblocks)
    Outputs:
        out: (..., nblocks * q)
    """
    n = x.shape[-1]
    nblocks, q, p = weight.shape
    assert nblocks * p == n

    x_reshaped = rearrange(x, '... (nblocks p) -> ... nblocks p', nblocks=nblocks)
    return rearrange(torch.einsum('...kp, kqp -> ...kq', x_reshaped, weight),
                     '... nblocks q -> ... (nblocks q)')
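

# Illustrative equivalence check, not part of the original module; the helper name and
# shapes are assumptions. Multiplying by the dense block-diagonal weight should match
# the blocked reference multiply.
def _example_reference_matches_dense():
    nblocks, q, p = 4, 3, 5
    weight = torch.randn(nblocks, q, p)
    x = torch.randn(2, nblocks * p)
    out_blocked = blockdiag_multiply_reference(x, weight)        # (2, nblocks * q)
    out_dense = x @ blockdiag_weight_to_dense_weight(weight).T   # (2, nblocks * q)
    assert torch.allclose(out_blocked, out_dense, atol=1e-6)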


class BlockdiagMultiply(torch.autograd.Function):
    """This is a faster implementation, with careful memory copies for the fastest
    bmm performance.
    The backward pass is also written manually with careful memory copies.
    Arguments:
        x: (..., n)
        weight: (nblocks, q, n / nblocks)
    Outputs:
        out: (..., nblocks * q)
    """

    @staticmethod
    def forward(ctx, x, weight):
        ctx.save_for_backward(x, weight)
        batch_shape, n = x.shape[:-1], x.shape[-1]
        batch_dim = np.prod(batch_shape)
        nblocks, q, p = weight.shape
        assert nblocks * p == n
        # Group the input by block: (nblocks, batch_dim, p), one GEMM per block via bmm.
        x_reshaped = x.reshape(batch_dim, nblocks, p).transpose(0, 1)
        # Preallocate the output as (batch_dim, nblocks, q), then view it as
        # (nblocks, batch_dim, q) so bmm can write into it directly.
        out = torch.empty(batch_dim, nblocks, q, device=x.device, dtype=x.dtype).transpose(0, 1)
        out = torch.bmm(x_reshaped, weight.transpose(-1, -2), out=out).transpose(0, 1)
        return out.reshape(*batch_shape, nblocks * q)

    @staticmethod
    def backward(ctx, dout):
        x, weight = ctx.saved_tensors
        batch_shape, n = x.shape[:-1], x.shape[-1]
        batch_dim = np.prod(batch_shape)
        nblocks, q, p = weight.shape
        assert nblocks * p == n
        dx, dweight = None, None
        dout_reshaped = dout.reshape(batch_dim, nblocks, q).transpose(0, 1)
        if ctx.needs_input_grad[0]:
            dx = torch.empty(batch_dim, nblocks, p, device=x.device, dtype=x.dtype)
            # dx[b, k, :] = dout[b, k, :] @ weight[k].conj(), written block by block into
            # the preallocated buffer, then flattened back to the input's batch shape.
            dx = torch.bmm(dout_reshaped, weight.conj(),
                           out=dx.transpose(0, 1)).transpose(0, 1).reshape(*batch_shape, n)
        if ctx.needs_input_grad[1]:
            x_reshaped = x.reshape(batch_dim, nblocks, p).transpose(0, 1)
            # dweight[k] = dout[:, k, :]^T @ x[:, k, :].conj(), summed over the batch.
            dweight = torch.bmm(dout_reshaped.transpose(-1, -2), x_reshaped.conj())
        return dx, dweight


blockdiag_multiply = BlockdiagMultiply.apply
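

# Illustrative sanity check, not part of the original module; the helper name and shapes
# are assumptions. The fast autograd path should agree with the slow reference in both
# the forward output and the gradients; gradcheck needs double-precision inputs to be
# reliable.
def _example_check_fast_path():
    torch.manual_seed(0)
    nblocks, q, p, batch = 4, 3, 5, 2
    weight = torch.randn(nblocks, q, p, dtype=torch.float64, requires_grad=True)
    x = torch.randn(batch, nblocks * p, dtype=torch.float64, requires_grad=True)
    out_fast = blockdiag_multiply(x, weight)
    out_ref = blockdiag_multiply_reference(x, weight)
    assert torch.allclose(out_fast, out_ref)
    # Check the handwritten backward against autograd's numerical gradients.
    assert torch.autograd.gradcheck(blockdiag_multiply, (x, weight))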