dreambooth-dog-1
/
diffusers
/examples
/wuerstchen
/text_to_image
/modeling_efficient_net_encoder.py
import torch.nn as nn | |
from torchvision.models import efficientnet_v2_l, efficientnet_v2_s | |
from diffusers.configuration_utils import ConfigMixin, register_to_config | |
from diffusers.models.modeling_utils import ModelMixin | |
class EfficientNetEncoder(ModelMixin, ConfigMixin):
    """Image encoder built from a torchvision EfficientNetV2 feature extractor.

    The ``features`` stack of the selected EfficientNetV2 model is used as the
    backbone. Its output is projected to ``c_latent`` channels by a bias-free
    1x1 convolution and then passed through batch normalization.
    """

    # Registering the constructor arguments stores them in the model config so
    # the ConfigMixin save/load machinery can rebuild the module.
    @register_to_config
    def __init__(
        self,
        c_latent: int = 16,
        c_cond: int = 1280,
        effnet: str = "efficientnet_v2_s",
    ) -> None:
        """Build the backbone and the latent projection head.

        Args:
            c_latent: Number of output channels produced by the mapper.
            c_cond: Number of input channels expected by the mapper;
                presumably must equal the channel count emitted by the
                backbone (TODO confirm for non-default backbones).
            effnet: ``"efficientnet_v2_s"`` selects the small variant; any
                other value selects ``efficientnet_v2_l``.
        """
        super().__init__()

        # Only the convolutional feature extractor is kept (``.features``).
        # ``weights="DEFAULT"`` asks torchvision for its default pretrained
        # weights.
        if effnet == "efficientnet_v2_s":
            self.backbone = efficientnet_v2_s(weights="DEFAULT").features
        else:
            # Every value other than "efficientnet_v2_s" falls through to the
            # large variant; kept as-is for backward compatibility.
            self.backbone = efficientnet_v2_l(weights="DEFAULT").features

        self.mapper = nn.Sequential(
            # 1x1 convolution: per-pixel projection from c_cond to c_latent.
            nn.Conv2d(c_cond, c_latent, kernel_size=1, bias=False),
            nn.BatchNorm2d(c_latent),  # then normalize them to have mean 0 and std 1
        )

    def forward(self, x):
        """Run ``x`` through the backbone and then the mapper.

        Args:
            x: Input passed directly to the backbone; presumably a batch of
                images shaped ``(N, 3, H, W)`` (TODO confirm against caller).

        Returns:
            The mapper's output for the backbone features.
        """
        return self.mapper(self.backbone(x))