# engine/BiRefNet/models/birefnet.py
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from huggingface_hub import PyTorchModelHubMixin
from kornia.filters import laplacian
from engine.BiRefNet.config import Config
from engine.BiRefNet.dataset import class_labels_TR_sorted
from .backbones.build_backbone import build_backbone
from .modules.aspp import ASPP, ASPPDeformable
from .modules.decoder_blocks import BasicDecBlk, ResBlk
from .modules.lateral_blocks import BasicLatBlk
from .refinement.refiner import Refiner, RefinerPVTInChannels4, RefUNet
from .refinement.stem_layer import StemLayer


def image2patches(
image,
grid_h=2,
grid_w=2,
patch_ref=None,
transformation="b c (hg h) (wg w) -> (b hg wg) c h w",
):
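    """Split `image` into a grid of patches.

    If `patch_ref` is given, the grid size is inferred from the ratio of the
    spatial sizes of `image` and `patch_ref`; otherwise `grid_h` and `grid_w`
    are used. The einops `transformation` decides whether patches are stacked
    along the batch axis (default) or folded into the channel axis.
    """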
if patch_ref is not None:
grid_h, grid_w = (
image.shape[-2] // patch_ref.shape[-2],
image.shape[-1] // patch_ref.shape[-1],
)
patches = rearrange(image, transformation, hg=grid_h, wg=grid_w)
return patches


def patches2image(
patches,
grid_h=2,
grid_w=2,
patch_ref=None,
transformation="(b hg wg) c h w -> b c (hg h) (wg w)",
):
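    """Inverse of `image2patches`: reassemble patches into a full image.

    If `patch_ref` is given, the grid size is inferred from the ratio of the
    spatial sizes of `patch_ref` and a single patch.
    """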
if patch_ref is not None:
grid_h, grid_w = (
patch_ref.shape[-2] // patches[0].shape[-2],
patch_ref.shape[-1] // patches[0].shape[-1],
)
image = rearrange(patches, transformation, hg=grid_h, wg=grid_w)
return image
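

# A minimal round-trip sketch (illustrative, not used by the model): splitting
# a (1, 3, 64, 64) tensor into a 2x2 grid yields four 32x32 patches, and
# reassembling them recovers the original tensor exactly, since both helpers
# are pure einops permutations.
#
#     img = torch.randn(1, 3, 64, 64)
#     patches = image2patches(img, grid_h=2, grid_w=2)       # (4, 3, 32, 32)
#     restored = patches2image(patches, grid_h=2, grid_w=2)  # (1, 3, 64, 64)
#     assert torch.equal(restored, img)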


class BiRefNet(
nn.Module,
PyTorchModelHubMixin,
library_name="birefnet",
repo_url="https://github.com/ZhengPeng7/BiRefNet",
tags=[
"Image Segmentation",
"Background Removal",
"Mask Generation",
"Dichotomous Image Segmentation",
"Camouflaged Object Detection",
"Salient Object Detection",
],
):
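    """Bilateral Reference Network for high-resolution dichotomous image
    segmentation (https://github.com/ZhengPeng7/BiRefNet)."""
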
def __init__(self, bb_pretrained=True):
super(BiRefNet, self).__init__()
self.config = Config()
self.epoch = 1
self.bb = build_backbone(self.config.bb, pretrained=bb_pretrained)
channels = self.config.lateral_channels_in_collection
if self.config.auxiliary_classification:
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.cls_head = nn.Sequential(
nn.Linear(channels[0], len(class_labels_TR_sorted))
)
        if self.config.squeeze_block:
            # config.squeeze_block is e.g. "BasicDecBlk_x1": a block class name
            # followed by "_x" and a repeat count.
            block_name, num_blocks = self.config.squeeze_block.split("_x")
            self.squeeze_module = nn.Sequential(
                *[
                    eval(block_name)(channels[0] + sum(self.config.cxt), channels[0])
                    for _ in range(int(num_blocks))
                ]
            )
self.decoder = Decoder(channels)
if self.config.ender:
self.dec_end = nn.Sequential(
nn.Conv2d(1, 16, 3, 1, 1),
nn.Conv2d(16, 1, 3, 1, 1),
nn.ReLU(inplace=True),
)
# refine patch-level segmentation
if self.config.refine:
if self.config.refine == "itself":
self.stem_layer = StemLayer(
in_channels=3 + 1,
inter_channels=48,
out_channels=3,
norm_layer="BN" if self.config.batch_size > 1 else "LN",
)
else:
self.refiner = eval(
"{}({})".format(self.config.refine, "in_channels=3+1")
)
        if self.config.freeze_bb:
            # Freeze all backbone parameters; refiner parameters stay trainable.
            for key, value in self.named_parameters():
                if "bb." in key and "refiner." not in key:
                    value.requires_grad = False

    def forward_enc(self, x):
if self.config.bb in ["vgg16", "vgg16bn", "resnet50"]:
x1 = self.bb.conv1(x)
x2 = self.bb.conv2(x1)
x3 = self.bb.conv3(x2)
x4 = self.bb.conv4(x3)
else:
x1, x2, x3, x4 = self.bb(x)
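        # Optionally fuse features from a half-resolution forward pass, by
        # channel concatenation ("cat") or element-wise addition ("add").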
if self.config.mul_scl_ipt == "cat":
B, C, H, W = x.shape
x1_, x2_, x3_, x4_ = self.bb(
F.interpolate(
x, size=(H // 2, W // 2), mode="bilinear", align_corners=True
)
)
x1 = torch.cat(
[
x1,
F.interpolate(
x1_, size=x1.shape[2:], mode="bilinear", align_corners=True
),
],
dim=1,
)
x2 = torch.cat(
[
x2,
F.interpolate(
x2_, size=x2.shape[2:], mode="bilinear", align_corners=True
),
],
dim=1,
)
x3 = torch.cat(
[
x3,
F.interpolate(
x3_, size=x3.shape[2:], mode="bilinear", align_corners=True
),
],
dim=1,
)
x4 = torch.cat(
[
x4,
F.interpolate(
x4_, size=x4.shape[2:], mode="bilinear", align_corners=True
),
],
dim=1,
)
elif self.config.mul_scl_ipt == "add":
B, C, H, W = x.shape
x1_, x2_, x3_, x4_ = self.bb(
F.interpolate(
x, size=(H // 2, W // 2), mode="bilinear", align_corners=True
)
)
x1 = x1 + F.interpolate(
x1_, size=x1.shape[2:], mode="bilinear", align_corners=True
)
x2 = x2 + F.interpolate(
x2_, size=x2.shape[2:], mode="bilinear", align_corners=True
)
x3 = x3 + F.interpolate(
x3_, size=x3.shape[2:], mode="bilinear", align_corners=True
)
x4 = x4 + F.interpolate(
x4_, size=x4.shape[2:], mode="bilinear", align_corners=True
)
class_preds = (
self.cls_head(self.avgpool(x4).view(x4.shape[0], -1))
if self.training and self.config.auxiliary_classification
else None
)
if self.config.cxt:
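            # Prepend the last len(cxt) shallower feature maps, resized to
            # x4's resolution, as extra context channels.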
x4 = torch.cat(
(
*[
F.interpolate(
x1, size=x4.shape[2:], mode="bilinear", align_corners=True
),
F.interpolate(
x2, size=x4.shape[2:], mode="bilinear", align_corners=True
),
F.interpolate(
x3, size=x4.shape[2:], mode="bilinear", align_corners=True
),
][-len(self.config.cxt) :],
x4,
),
dim=1,
)
return (x1, x2, x3, x4), class_preds

    def forward_ori(self, x):
########## Encoder ##########
(x1, x2, x3, x4), class_preds = self.forward_enc(x)
if self.config.squeeze_block:
x4 = self.squeeze_module(x4)
########## Decoder ##########
features = [x, x1, x2, x3, x4]
        if self.training and self.config.out_ref:
            # The Laplacian of the grayscale input serves as the gradient
            # ground truth for the decoder's gradient supervision.
            features.append(
                laplacian(torch.mean(x, dim=1).unsqueeze(1), kernel_size=5)
            )
scaled_preds = self.decoder(features)
return scaled_preds, class_preds

    def forward(self, x):
scaled_preds, class_preds = self.forward_ori(x)
class_preds_lst = [class_preds]
return [scaled_preds, class_preds_lst] if self.training else scaled_preds
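

# A minimal inference sketch (assumption: weights are published on the Hugging
# Face Hub under "ZhengPeng7/BiRefNet", the upstream repo id; substitute your
# own checkpoint if it differs):
#
#     model = BiRefNet.from_pretrained("ZhengPeng7/BiRefNet")  # PyTorchModelHubMixin
#     model.eval()
#     with torch.no_grad():
#         preds = model(torch.randn(1, 3, 1024, 1024))
#     mask = preds[-1].sigmoid()  # finest-scale prediction, values in [0, 1]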


class Decoder(nn.Module):
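    """Multi-scale decoder with lateral links, optional input injection
    (dec_ipt), and optional gradient supervision (out_ref)."""
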
def __init__(self, channels):
super(Decoder, self).__init__()
self.config = Config()
DecoderBlock = eval(self.config.dec_blk)
LateralBlock = eval(self.config.lat_blk)
        # Hoisted so the guarded channel-width expressions below are well
        # defined regardless of whether dec_ipt is enabled.
        N_dec_ipt = 64
        ipt_cha_opt = 1
        if self.config.dec_ipt:
            self.split = self.config.dec_ipt_split
            DBlock = SimpleConvs
            ic = 64
self.ipt_blk5 = DBlock(
2**10 * 3 if self.split else 3,
[N_dec_ipt, channels[0] // 8][ipt_cha_opt],
inter_channels=ic,
)
self.ipt_blk4 = DBlock(
2**8 * 3 if self.split else 3,
[N_dec_ipt, channels[0] // 8][ipt_cha_opt],
inter_channels=ic,
)
self.ipt_blk3 = DBlock(
2**6 * 3 if self.split else 3,
[N_dec_ipt, channels[1] // 8][ipt_cha_opt],
inter_channels=ic,
)
self.ipt_blk2 = DBlock(
2**4 * 3 if self.split else 3,
[N_dec_ipt, channels[2] // 8][ipt_cha_opt],
inter_channels=ic,
)
self.ipt_blk1 = DBlock(
2**0 * 3 if self.split else 3,
[N_dec_ipt, channels[3] // 8][ipt_cha_opt],
inter_channels=ic,
)
else:
self.split = None
self.decoder_block4 = DecoderBlock(
channels[0]
+ (
[N_dec_ipt, channels[0] // 8][ipt_cha_opt] if self.config.dec_ipt else 0
),
channels[1],
)
self.decoder_block3 = DecoderBlock(
channels[1]
+ (
[N_dec_ipt, channels[0] // 8][ipt_cha_opt] if self.config.dec_ipt else 0
),
channels[2],
)
self.decoder_block2 = DecoderBlock(
channels[2]
+ (
[N_dec_ipt, channels[1] // 8][ipt_cha_opt] if self.config.dec_ipt else 0
),
channels[3],
)
self.decoder_block1 = DecoderBlock(
channels[3]
+ (
[N_dec_ipt, channels[2] // 8][ipt_cha_opt] if self.config.dec_ipt else 0
),
channels[3] // 2,
)
self.conv_out1 = nn.Sequential(
nn.Conv2d(
channels[3] // 2
+ (
[N_dec_ipt, channels[3] // 8][ipt_cha_opt]
if self.config.dec_ipt
else 0
),
1,
1,
1,
0,
)
)
self.lateral_block4 = LateralBlock(channels[1], channels[1])
self.lateral_block3 = LateralBlock(channels[2], channels[2])
self.lateral_block2 = LateralBlock(channels[3], channels[3])
if self.config.ms_supervision:
self.conv_ms_spvn_4 = nn.Conv2d(channels[1], 1, 1, 1, 0)
self.conv_ms_spvn_3 = nn.Conv2d(channels[2], 1, 1, 1, 0)
self.conv_ms_spvn_2 = nn.Conv2d(channels[3], 1, 1, 1, 0)
if self.config.out_ref:
_N = 16
self.gdt_convs_4 = nn.Sequential(
nn.Conv2d(channels[1], _N, 3, 1, 1),
nn.BatchNorm2d(_N) if self.config.batch_size > 1 else nn.Identity(),
nn.ReLU(inplace=True),
)
self.gdt_convs_3 = nn.Sequential(
nn.Conv2d(channels[2], _N, 3, 1, 1),
nn.BatchNorm2d(_N) if self.config.batch_size > 1 else nn.Identity(),
nn.ReLU(inplace=True),
)
self.gdt_convs_2 = nn.Sequential(
nn.Conv2d(channels[3], _N, 3, 1, 1),
nn.BatchNorm2d(_N) if self.config.batch_size > 1 else nn.Identity(),
nn.ReLU(inplace=True),
)
self.gdt_convs_pred_4 = nn.Sequential(nn.Conv2d(_N, 1, 1, 1, 0))
self.gdt_convs_pred_3 = nn.Sequential(nn.Conv2d(_N, 1, 1, 1, 0))
self.gdt_convs_pred_2 = nn.Sequential(nn.Conv2d(_N, 1, 1, 1, 0))
self.gdt_convs_attn_4 = nn.Sequential(nn.Conv2d(_N, 1, 1, 1, 0))
self.gdt_convs_attn_3 = nn.Sequential(nn.Conv2d(_N, 1, 1, 1, 0))
self.gdt_convs_attn_2 = nn.Sequential(nn.Conv2d(_N, 1, 1, 1, 0))

    def forward(self, features):
        if self.training and self.config.out_ref:
            # With gradient supervision (out_ref), forward_ori also appends the
            # Laplacian of the input as the gradient ground truth (gdt_gt).
            outs_gdt_pred = []
            outs_gdt_label = []
            x, x1, x2, x3, x4, gdt_gt = features
else:
x, x1, x2, x3, x4 = features
outs = []
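        # At each decoding stage, dec_ipt optionally re-injects the raw image,
        # patchified into the channel axis when dec_ipt_split is set.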
if self.config.dec_ipt:
patches_batch = (
image2patches(
x,
patch_ref=x4,
transformation="b c (hg h) (wg w) -> b (c hg wg) h w",
)
if self.split
else x
)
x4 = torch.cat(
(
x4,
self.ipt_blk5(
F.interpolate(
patches_batch,
size=x4.shape[2:],
mode="bilinear",
align_corners=True,
)
),
),
1,
)
p4 = self.decoder_block4(x4)
m4 = (
self.conv_ms_spvn_4(p4)
if self.config.ms_supervision and self.training
else None
)
if self.config.out_ref:
p4_gdt = self.gdt_convs_4(p4)
if self.training:
# >> GT:
m4_dia = m4
gdt_label_main_4 = gdt_gt * F.interpolate(
m4_dia, size=gdt_gt.shape[2:], mode="bilinear", align_corners=True
)
outs_gdt_label.append(gdt_label_main_4)
# >> Pred:
gdt_pred_4 = self.gdt_convs_pred_4(p4_gdt)
outs_gdt_pred.append(gdt_pred_4)
gdt_attn_4 = self.gdt_convs_attn_4(p4_gdt).sigmoid()
# >> Finally:
p4 = p4 * gdt_attn_4
_p4 = F.interpolate(p4, size=x3.shape[2:], mode="bilinear", align_corners=True)
_p3 = _p4 + self.lateral_block4(x3)
if self.config.dec_ipt:
patches_batch = (
image2patches(
x,
patch_ref=_p3,
transformation="b c (hg h) (wg w) -> b (c hg wg) h w",
)
if self.split
else x
)
_p3 = torch.cat(
(
_p3,
self.ipt_blk4(
F.interpolate(
patches_batch,
size=x3.shape[2:],
mode="bilinear",
align_corners=True,
)
),
),
1,
)
p3 = self.decoder_block3(_p3)
m3 = (
self.conv_ms_spvn_3(p3)
if self.config.ms_supervision and self.training
else None
)
if self.config.out_ref:
p3_gdt = self.gdt_convs_3(p3)
if self.training:
# >> GT:
# m3 --dilation--> m3_dia
# G_3^gt * m3_dia --> G_3^m, which is the label of gradient
m3_dia = m3
gdt_label_main_3 = gdt_gt * F.interpolate(
m3_dia, size=gdt_gt.shape[2:], mode="bilinear", align_corners=True
)
outs_gdt_label.append(gdt_label_main_3)
# >> Pred:
# p3 --conv--BN--> F_3^G, where F_3^G predicts the \hat{G_3} with xx
# F_3^G --sigmoid--> A_3^G
gdt_pred_3 = self.gdt_convs_pred_3(p3_gdt)
outs_gdt_pred.append(gdt_pred_3)
gdt_attn_3 = self.gdt_convs_attn_3(p3_gdt).sigmoid()
# >> Finally:
# p3 = p3 * A_3^G
p3 = p3 * gdt_attn_3
_p3 = F.interpolate(p3, size=x2.shape[2:], mode="bilinear", align_corners=True)
_p2 = _p3 + self.lateral_block3(x2)
if self.config.dec_ipt:
patches_batch = (
image2patches(
x,
patch_ref=_p2,
transformation="b c (hg h) (wg w) -> b (c hg wg) h w",
)
if self.split
else x
)
_p2 = torch.cat(
(
_p2,
self.ipt_blk3(
F.interpolate(
patches_batch,
size=x2.shape[2:],
mode="bilinear",
align_corners=True,
)
),
),
1,
)
p2 = self.decoder_block2(_p2)
m2 = (
self.conv_ms_spvn_2(p2)
if self.config.ms_supervision and self.training
else None
)
if self.config.out_ref:
p2_gdt = self.gdt_convs_2(p2)
if self.training:
# >> GT:
m2_dia = m2
gdt_label_main_2 = gdt_gt * F.interpolate(
m2_dia, size=gdt_gt.shape[2:], mode="bilinear", align_corners=True
)
outs_gdt_label.append(gdt_label_main_2)
# >> Pred:
gdt_pred_2 = self.gdt_convs_pred_2(p2_gdt)
outs_gdt_pred.append(gdt_pred_2)
gdt_attn_2 = self.gdt_convs_attn_2(p2_gdt).sigmoid()
# >> Finally:
p2 = p2 * gdt_attn_2
_p2 = F.interpolate(p2, size=x1.shape[2:], mode="bilinear", align_corners=True)
_p1 = _p2 + self.lateral_block2(x1)
if self.config.dec_ipt:
patches_batch = (
image2patches(
x,
patch_ref=_p1,
transformation="b c (hg h) (wg w) -> b (c hg wg) h w",
)
if self.split
else x
)
_p1 = torch.cat(
(
_p1,
self.ipt_blk2(
F.interpolate(
patches_batch,
size=x1.shape[2:],
mode="bilinear",
align_corners=True,
)
),
),
1,
)
_p1 = self.decoder_block1(_p1)
_p1 = F.interpolate(_p1, size=x.shape[2:], mode="bilinear", align_corners=True)
if self.config.dec_ipt:
patches_batch = (
image2patches(
x,
patch_ref=_p1,
transformation="b c (hg h) (wg w) -> b (c hg wg) h w",
)
if self.split
else x
)
_p1 = torch.cat(
(
_p1,
self.ipt_blk1(
F.interpolate(
patches_batch,
size=x.shape[2:],
mode="bilinear",
align_corners=True,
)
),
),
1,
)
p1_out = self.conv_out1(_p1)
if self.config.ms_supervision and self.training:
outs.append(m4)
outs.append(m3)
outs.append(m2)
outs.append(p1_out)
return (
outs
if not (self.config.out_ref and self.training)
else ([outs_gdt_pred, outs_gdt_label], outs)
)


class SimpleConvs(nn.Module):
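    """Two 3x3 convolutions used to embed (patchified) image inputs for dec_ipt."""
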
def __init__(self, in_channels: int, out_channels: int, inter_channels=64) -> None:
super().__init__()
self.conv1 = nn.Conv2d(in_channels, inter_channels, 3, 1, 1)
self.conv_out = nn.Conv2d(inter_channels, out_channels, 3, 1, 1)

    def forward(self, x):
return self.conv_out(self.conv1(x))


###########
class BiRefNetC2F(
nn.Module,
PyTorchModelHubMixin,
library_name="birefnet_c2f",
repo_url="https://github.com/ZhengPeng7/BiRefNet_C2F",
tags=[
"Image Segmentation",
"Background Removal",
"Mask Generation",
"Dichotomous Image Segmentation",
"Camouflaged Object Detection",
"Salient Object Detection",
],
):
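    """Coarse-to-fine variant: a coarse BiRefNet segments a downscaled input,
    and a fine BiRefNet refines high-resolution patches conditioned on the
    coarse prediction."""
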
def __init__(self, bb_pretrained=True):
super(BiRefNetC2F, self).__init__()
self.config = Config()
self.epoch = 1
self.grid = 4
        self.model_coarse = BiRefNet(bb_pretrained=bb_pretrained)
        self.model_fine = BiRefNet(bb_pretrained=bb_pretrained)
self.input_mixer = nn.Conv2d(4, 3, 1, 1, 0)
self.output_mixer_merge_post = nn.Sequential(
nn.Conv2d(1, 16, 3, 1, 1), nn.Conv2d(16, 1, 3, 1, 1)
)

    def forward(self, x):
x_ori = x.clone()
########## Coarse ##########
x = F.interpolate(
x,
size=[s // self.grid for s in self.config.size[::-1]],
mode="bilinear",
align_corners=True,
)
if self.training:
scaled_preds, class_preds_lst = self.model_coarse(x)
else:
scaled_preds = self.model_coarse(x)
########## Fine ##########
x_HR_patches = image2patches(
x_ori, patch_ref=x, transformation="b c (hg h) (wg w) -> (b hg wg) c h w"
)
pred = F.interpolate(
(
scaled_preds[-1]
if not (self.config.out_ref and self.training)
else scaled_preds[1][-1]
),
size=x_ori.shape[2:],
mode="bilinear",
align_corners=True,
)
pred_patches = image2patches(
pred, patch_ref=x, transformation="b c (hg h) (wg w) -> (b hg wg) c h w"
)
        # Condition each high-resolution patch on the coarse prediction by
        # concatenating it as a fourth channel, then mix back down to 3 channels.
        x_HR = self.input_mixer(torch.cat([x_HR_patches, pred_patches], dim=1))
if self.training:
scaled_preds_HR, class_preds_lst_HR = self.model_fine(x_HR)
else:
scaled_preds_HR = self.model_fine(x_HR)
if self.training:
if self.config.out_ref:
[outs_gdt_pred, outs_gdt_label], outs = scaled_preds
[outs_gdt_pred_HR, outs_gdt_label_HR], outs_HR = scaled_preds_HR
for idx_out, out_HR in enumerate(outs_HR):
outs_HR[idx_out] = self.output_mixer_merge_post(
patches2image(
out_HR,
grid_h=self.grid,
grid_w=self.grid,
transformation="(b hg wg) c h w -> b c (hg h) (wg w)",
)
)
return [
(
[
outs_gdt_pred + outs_gdt_pred_HR,
outs_gdt_label + outs_gdt_label_HR,
],
outs + outs_HR,
),
class_preds_lst,
] # handle gt here
else:
return [
scaled_preds
+ [
self.output_mixer_merge_post(
patches2image(
scaled_pred_HR,
grid_h=self.grid,
grid_w=self.grid,
transformation="(b hg wg) c h w -> b c (hg h) (wg w)",
)
)
for scaled_pred_HR in scaled_preds_HR
],
class_preds_lst,
]
else:
return scaled_preds + [
self.output_mixer_merge_post(
patches2image(
scaled_pred_HR,
grid_h=self.grid,
grid_w=self.grid,
transformation="(b hg wg) c h w -> b c (hg h) (wg w)",
)
)
for scaled_pred_HR in scaled_preds_HR
]
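

if __name__ == "__main__":
    # Hedged smoke test (assumptions: Config resolves to a valid local setup
    # and the chosen backbone can be built without pretrained weights). This
    # only checks that the forward pass runs and reports output shapes.
    model = BiRefNet(bb_pretrained=False).eval()
    with torch.no_grad():
        preds = model(torch.randn(1, 3, 1024, 1024))
    print([tuple(p.shape) for p in preds])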