import gradio as gr
import argparse
import cv2
import imageio
import math
from math import ceil
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import numpy as np
from PIL import Image
import torch
import torchvision  # needed for the 'resnet101' backbone branch below
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function
class RelationModuleMultiScale(torch.nn.Module):
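    # Multi-scale temporal relation module (in the spirit of TRN, ECCV 2018): for every scale
    # k = num_frames, ..., 2 it samples up to `subsample_num` k-frame combinations, fuses each
    # combination with a small MLP, and returns one fused vector per scale.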
def __init__(self, img_feature_dim, num_bottleneck, num_frames):
super(RelationModuleMultiScale, self).__init__()
self.subsample_num = 3
self.img_feature_dim = img_feature_dim
self.scales = [i for i in range(num_frames, 1, -1)]
self.relations_scales = []
self.subsample_scales = []
for scale in self.scales:
relations_scale = self.return_relationset(num_frames, scale)
self.relations_scales.append(relations_scale)
self.subsample_scales.append(min(self.subsample_num, len(relations_scale)))
self.num_frames = num_frames
        self.fc_fusion_scales = nn.ModuleList()  # one fusion MLP per relation scale
for i in range(len(self.scales)):
scale = self.scales[i]
fc_fusion = nn.Sequential(nn.ReLU(), nn.Linear(scale * self.img_feature_dim, num_bottleneck), nn.ReLU())
self.fc_fusion_scales += [fc_fusion]
def forward(self, input):
act_scale_1 = input[:, self.relations_scales[0][0] , :]
act_scale_1 = act_scale_1.view(act_scale_1.size(0), self.scales[0] * self.img_feature_dim)
act_scale_1 = self.fc_fusion_scales[0](act_scale_1)
act_scale_1 = act_scale_1.unsqueeze(1)
act_all = act_scale_1.clone()
for scaleID in range(1, len(self.scales)):
act_relation_all = torch.zeros_like(act_scale_1)
num_total_relations = len(self.relations_scales[scaleID])
num_select_relations = self.subsample_scales[scaleID]
idx_relations_evensample = [int(ceil(i * num_total_relations / num_select_relations)) for i in range(num_select_relations)]
for idx in idx_relations_evensample:
act_relation = input[:, self.relations_scales[scaleID][idx], :]
act_relation = act_relation.view(act_relation.size(0), self.scales[scaleID] * self.img_feature_dim)
act_relation = self.fc_fusion_scales[scaleID](act_relation)
act_relation = act_relation.unsqueeze(1)
act_relation_all += act_relation
act_all = torch.cat((act_all, act_relation_all), 1)
return act_all
def return_relationset(self, num_frames, num_frames_relation):
import itertools
return list(itertools.combinations([i for i in range(num_frames)], num_frames_relation))
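# A minimal shape check for the relation module (illustrative only; the numbers mirror the
# defaults used further down):
#   trn = RelationModuleMultiScale(img_feature_dim=512, num_bottleneck=256, num_frames=8)
#   out = trn(torch.randn(4, 8, 512))   # -> [4, 7, 256]: one fused feature per relation scale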
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', default='Sprite', help='datasets')
parser.add_argument('--data_root', default='dataset', help='root directory for data')
parser.add_argument('--num_class', type=int, default=15, help='number of action classes')
parser.add_argument('--input_type', default='image', choices=['feature', 'image'], help='the type of input')
parser.add_argument('--src', default='domain_1', help='source domain')
parser.add_argument('--tar', default='domain_2', help='target domain')
parser.add_argument('--num_segments', type=int, default=8, help='the number of frame segment')
parser.add_argument('--backbone', type=str, default="dcgan", choices=['dcgan', 'resnet101', 'I3Dpretrain','I3Dfinetune'], help='backbone')
parser.add_argument('--channels', default=3, type=int, help='input channels for image inputs')
parser.add_argument('--add_fc', default=1, type=int, metavar='M', help='number of additional fc layers (excluding the last fc layer) (e.g. 0, 1, 2)')
parser.add_argument('--fc_dim', type=int, default=1024, help='dimension of added fc')
parser.add_argument('--frame_aggregation', type=str, default='trn', choices=[ 'rnn', 'trn'], help='aggregation of frame features (none if baseline_type is not video)')
parser.add_argument('--dropout_rate', default=0.5, type=float, help='dropout ratio for frame-level feature (default: 0.5)')
parser.add_argument('--f_dim', type=int, default=512, help='dim of f')
parser.add_argument('--z_dim', type=int, default=512, help='dimensionality of z_t')
parser.add_argument('--f_rnn_layers', type=int, default=1, help='number of layers (content lstm)')
parser.add_argument('--use_bn', type=str, default='none', choices=['none', 'AdaBN', 'AutoDIAL'], help='normalization-based methods')
parser.add_argument('--prior_sample', type=str, default='random', choices=['random', 'post'], help='how to sample prior')
parser.add_argument('--batch_size', default=128, type=int, help='batch size')
parser.add_argument('--use_attn', type=str, default='TransAttn', choices=['none', 'TransAttn', 'general'], help='attention-mechanism')
parser.add_argument('--data_threads', type=int, default=5, help='number of data loading threads')
opt = parser.parse_args(args=[])
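# parse_args(args=[]) keeps every option at its default value, so the demo can run inside
# Gradio without reading command-line flags.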
class GradReverse(Function):
@staticmethod
def forward(ctx, x, beta):
ctx.beta = beta
return x.view_as(x)
@staticmethod
def backward(ctx, grad_output):
grad_input = grad_output.neg() * ctx.beta
return grad_input, None
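# Gradient reversal layer (DANN-style): the forward pass is the identity, while the backward
# pass multiplies incoming gradients by -beta, turning the domain classifiers below into
# adversaries of the feature extractor. Typical use: feat_rev = GradReverse.apply(feat, beta).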
class TransferVAE_Video(nn.Module):
def __init__(self, opt):
super(TransferVAE_Video, self).__init__()
self.f_dim = opt.f_dim
self.z_dim = opt.z_dim
self.fc_dim = opt.fc_dim
self.channels = opt.channels
self.input_type = opt.input_type
self.frames = opt.num_segments
self.use_bn = opt.use_bn
self.frame_aggregation = opt.frame_aggregation
self.batch_size = opt.batch_size
self.use_attn = opt.use_attn
self.dropout_rate = opt.dropout_rate
self.num_class = opt.num_class
self.prior_sample = opt.prior_sample
if self.input_type == 'image':
import dcgan_64
self.encoder = dcgan_64.encoder(self.fc_dim, self.channels)
self.decoder = dcgan_64.decoder_woSkip(self.z_dim + self.f_dim, self.channels)
self.fc_output_dim = self.fc_dim
elif self.input_type == 'feature':
if opt.backbone == 'resnet101':
                model_backbone = getattr(torchvision.models, opt.backbone)(True)  # instantiated only to read the feature dimension
                self.input_dim = model_backbone.fc.in_features
elif opt.backbone == 'I3Dpretrain':
self.input_dim = 2048
elif opt.backbone == 'I3Dfinetune':
self.input_dim = 2048
self.add_fc = opt.add_fc
self.enc_fc_layer1 = nn.Linear(self.input_dim, self.fc_dim)
self.dec_fc_layer1 = nn.Linear(self.fc_dim, self.input_dim)
self.fc_output_dim = self.fc_dim
if self.use_bn == 'shared':
self.bn_enc_layer1 = nn.BatchNorm1d(self.fc_output_dim)
self.bn_dec_layer1 = nn.BatchNorm1d(self.input_dim)
elif self.use_bn == 'separated':
self.bn_S_enc_layer1 = nn.BatchNorm1d(self.fc_output_dim)
self.bn_T_enc_layer1 = nn.BatchNorm1d(self.fc_output_dim)
self.bn_S_dec_layer1 = nn.BatchNorm1d(self.input_dim)
self.bn_T_dec_layer1 = nn.BatchNorm1d(self.input_dim)
if self.add_fc > 1:
self.enc_fc_layer2 = nn.Linear(self.fc_dim, self.fc_dim)
self.dec_fc_layer2 = nn.Linear(self.fc_dim, self.fc_dim)
self.fc_output_dim = self.fc_dim
## use batchnormalization or not (if yes whether the source and target share the same batchnormalization)
if self.use_bn == 'shared':
self.bn_enc_layer2 = nn.BatchNorm1d(self.fc_output_dim)
self.bn_dec_layer2 = nn.BatchNorm1d(self.fc_dim)
elif self.use_bn == 'separated':
self.bn_S_enc_layer2 = nn.BatchNorm1d(self.fc_output_dim)
self.bn_T_enc_layer2 = nn.BatchNorm1d(self.fc_output_dim)
self.bn_S_dec_layer2 = nn.BatchNorm1d(self.fc_dim)
self.bn_T_dec_layer2 = nn.BatchNorm1d(self.fc_dim)
if self.add_fc > 2:
self.enc_fc_layer3 = nn.Linear(self.fc_dim, self.fc_dim)
self.dec_fc_layer3 = nn.Linear(self.fc_dim, self.fc_dim)
self.fc_output_dim = self.fc_dim
## use batchnormalization or not (if yes whether the source and target share the same batchnormalization)
if self.use_bn == 'shared':
self.bn_enc_layer3 = nn.BatchNorm1d(self.fc_output_dim)
self.bn_dec_layer3 = nn.BatchNorm1d(self.fc_dim)
elif self.use_bn == 'separated':
self.bn_S_enc_layer3 = nn.BatchNorm1d(self.fc_output_dim)
self.bn_T_enc_layer3 = nn.BatchNorm1d(self.fc_output_dim)
self.bn_S_dec_layer3 = nn.BatchNorm1d(self.fc_dim)
self.bn_T_dec_layer3 = nn.BatchNorm1d(self.fc_dim)
self.z_2_out = nn.Linear(self.z_dim + self.f_dim, self.fc_output_dim)
## nonlinearity and dropout
self.relu = nn.LeakyReLU(0.1)
self.dropout_f = nn.Dropout(p=self.dropout_rate)
self.dropout_v = nn.Dropout(p=self.dropout_rate)
# -------------------------------
        ## Disentangle structure
# -------------------------------
#self.hidden_dim = opt.rnn_size
self.hidden_dim = opt.z_dim
self.f_rnn_layers = opt.f_rnn_layers
# Prior of content is a uniform Gaussian and prior of the dynamics is an LSTM
self.z_prior_lstm_ly1 = nn.LSTMCell(self.z_dim, self.hidden_dim)
self.z_prior_lstm_ly2 = nn.LSTMCell(self.hidden_dim, self.hidden_dim)
self.z_prior_mean = nn.Linear(self.hidden_dim, self.z_dim)
self.z_prior_logvar = nn.Linear(self.hidden_dim, self.z_dim)
# POSTERIOR DISTRIBUTION NETWORKS
# content and motion features share one lstm
self.z_lstm = nn.LSTM(self.fc_output_dim, self.hidden_dim, self.f_rnn_layers, bidirectional=True, batch_first=True)
self.f_mean = nn.Linear(self.hidden_dim * 2, self.f_dim)
self.f_logvar = nn.Linear(self.hidden_dim * 2, self.f_dim)
self.z_rnn = nn.RNN(self.hidden_dim * 2, self.hidden_dim, batch_first=True)
# Each timestep is for each z so no reshaping and feature mixing
self.z_mean = nn.Linear(self.hidden_dim, self.z_dim)
self.z_logvar = nn.Linear(self.hidden_dim, self.z_dim)
# -------------------------------
## z_t constraints
# -------------------------------
## adversarial loss for frame features z_t
self.fc_feature_domain_frame = nn.Linear(self.z_dim, self.z_dim)
self.fc_classifier_domain_frame = nn.Linear(self.z_dim, 2)
## #------ aggregate frame-based features (frame feature --> video feature) ------#
if self.frame_aggregation == 'rnn':
self.bilstm = nn.LSTM(self.z_dim, self.z_dim * 2, self.f_rnn_layers, bidirectional=True, batch_first=True)
self.feat_aggregated_dim = self.z_dim * 2
elif self.frame_aggregation == 'trn': # 4. TRN (ECCV 2018) ==> fix segment # for both train/val
self.num_bottleneck = 256 # 256
self.TRN = RelationModuleMultiScale(self.z_dim, self.num_bottleneck, self.frames)
self.bn_trn_S = nn.BatchNorm1d(self.num_bottleneck)
self.bn_trn_T = nn.BatchNorm1d(self.num_bottleneck)
self.feat_aggregated_dim = self.num_bottleneck
## adversarial loss for video features
self.fc_feature_domain_video = nn.Linear(self.feat_aggregated_dim, self.feat_aggregated_dim)
self.fc_classifier_domain_video = nn.Linear(self.feat_aggregated_dim, 2)
## adversarial loss for each relation of features
if self.frame_aggregation == 'trn':
self.relation_domain_classifier_all = nn.ModuleList()
for i in range(self.frames-1):
relation_domain_classifier = nn.Sequential(
nn.Linear(self.feat_aggregated_dim, self.feat_aggregated_dim),
nn.ReLU(),
nn.Linear(self.feat_aggregated_dim, 2)
)
self.relation_domain_classifier_all += [relation_domain_classifier]
## classifier for action prediction task
self.pred_classifier_video = nn.Linear(self.feat_aggregated_dim, self.num_class)
## classifier for prediction domains
self.fc_feature_domain_latent = nn.Linear(self.f_dim, self.f_dim)
        self.fc_classifier_doamin_latent = nn.Linear(self.f_dim, 2)  # note: misspelled attribute name kept so the pretrained checkpoint keys still match
## attention option
if self.use_attn == 'general':
self.attn_layer = nn.Sequential(
nn.Linear(self.feat_aggregated_dim, self.feat_aggregated_dim),
nn.Tanh(),
nn.Linear(self.feat_aggregated_dim, 1)
)
def domain_classifier_frame(self, feat, beta):
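        # Frame-level domain head: reverse gradients on each z_t, then a 2-way (source / target)
        # classifier; beta scales the reversed gradient.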
feat_fc_domain_frame = GradReverse.apply(feat, beta)
feat_fc_domain_frame = self.fc_feature_domain_frame(feat_fc_domain_frame)
feat_fc_domain_frame = self.relu(feat_fc_domain_frame)
pred_fc_domain_frame = self.fc_classifier_domain_frame(feat_fc_domain_frame)
return pred_fc_domain_frame
def domain_classifier_video(self, feat_video, beta):
feat_fc_domain_video = GradReverse.apply(feat_video, beta)
feat_fc_domain_video = self.fc_feature_domain_video(feat_fc_domain_video)
feat_fc_domain_video = self.relu(feat_fc_domain_video)
pred_fc_domain_video = self.fc_classifier_domain_video(feat_fc_domain_video)
return pred_fc_domain_video
def domain_classifier_latent(self, f):
feat_fc_domain_latent = self.fc_feature_domain_latent(f)
feat_fc_domain_latent = self.relu(feat_fc_domain_latent)
pred_fc_domain_latent = self.fc_classifier_doamin_latent(feat_fc_domain_latent)
return pred_fc_domain_latent
def domain_classifier_relation(self, feat_relation, beta):
pred_fc_domain_relation_video = None
for i in range(len(self.relation_domain_classifier_all)):
feat_relation_single = feat_relation[:,i,:].squeeze(1) # 128x1x256 --> 128x256
feat_fc_domain_relation_single = GradReverse.apply(feat_relation_single, beta) # the same beta for all relations (for now)
pred_fc_domain_relation_single = self.relation_domain_classifier_all[i](feat_fc_domain_relation_single)
if pred_fc_domain_relation_video is None:
pred_fc_domain_relation_video = pred_fc_domain_relation_single.view(-1,1,2)
else:
pred_fc_domain_relation_video = torch.cat((pred_fc_domain_relation_video, pred_fc_domain_relation_single.view(-1,1,2)), 1)
pred_fc_domain_relation_video = pred_fc_domain_relation_video.view(-1,2)
return pred_fc_domain_relation_video
def get_trans_attn(self, pred_domain):
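        # "TransAttn" weighting: attention weight = 1 - entropy of the domain prediction, so
        # frames / relations whose domain the classifier predicts confidently get larger weights.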
softmax = nn.Softmax(dim=1)
logsoftmax = nn.LogSoftmax(dim=1)
entropy = torch.sum(-softmax(pred_domain) * logsoftmax(pred_domain), 1)
weights = 1 - entropy
return weights
def get_general_attn(self, feat):
num_segments = feat.size()[1]
feat = feat.view(-1, feat.size()[-1]) # reshape features: 128x4x256 --> (128x4)x256
weights = self.attn_layer(feat) # e.g. (128x4)x1
weights = weights.view(-1, num_segments, weights.size()[-1]) # reshape attention weights: (128x4)x1 --> 128x4x1
weights = F.softmax(weights, dim=1) # softmax over segments ==> 128x4x1
return weights
def get_attn_feat_relation(self, feat_fc, pred_domain, num_segments):
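        # Residual attention over the relation features: the chosen weights are added to 1,
        # so attention rescales rather than zeroes out any relation.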
if self.use_attn == 'TransAttn':
weights_attn = self.get_trans_attn(pred_domain)
elif self.use_attn == 'general':
weights_attn = self.get_general_attn(feat_fc)
weights_attn = weights_attn.view(-1, num_segments-1, 1).repeat(1,1,feat_fc.size()[-1]) # reshape & repeat weights (e.g. 16 x 4 x 256)
feat_fc_attn = (weights_attn+1) * feat_fc
return feat_fc_attn, weights_attn[:,:,0]
def encode_and_sample_post(self, x):
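        # Posterior networks: a bidirectional LSTM runs over the per-frame embeddings; the two
        # end hidden states give the static code f, and a one-directional RNN on top gives the
        # per-frame dynamic codes z_t. With random_sampling=False the posterior means are used.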
if isinstance(x, list):
conv_x = self.encoder_frame(x[0])
else:
conv_x = self.encoder_frame(x)
# pass the bidirectional lstm
lstm_out, _ = self.z_lstm(conv_x)
# get f:
backward = lstm_out[:, 0, self.hidden_dim:2 * self.hidden_dim]
frontal = lstm_out[:, self.frames - 1, 0:self.hidden_dim]
lstm_out_f = torch.cat((frontal, backward), dim=1)
f_mean = self.f_mean(lstm_out_f)
f_logvar = self.f_logvar(lstm_out_f)
f_post = self.reparameterize(f_mean, f_logvar, random_sampling=False)
# pass to one direction rnn
features, _ = self.z_rnn(lstm_out)
z_mean = self.z_mean(features)
z_logvar = self.z_logvar(features)
z_post = self.reparameterize(z_mean, z_logvar, random_sampling=False)
if isinstance(x, list):
f_mean_list = [f_mean]
f_post_list = [f_post]
for t in range(1,3,1):
conv_x = self.encoder_frame(x[t])
lstm_out, _ = self.z_lstm(conv_x)
# get f:
backward = lstm_out[:, 0, self.hidden_dim:2 * self.hidden_dim]
frontal = lstm_out[:, self.frames - 1, 0:self.hidden_dim]
lstm_out_f = torch.cat((frontal, backward), dim=1)
f_mean = self.f_mean(lstm_out_f)
f_logvar = self.f_logvar(lstm_out_f)
f_post = self.reparameterize(f_mean, f_logvar, random_sampling=False)
f_mean_list.append(f_mean)
f_post_list.append(f_post)
f_mean = f_mean_list
f_post = f_post_list
        # f_mean and f_post are lists when the input x is a list (triplet input), tensors otherwise
return f_mean, f_logvar, f_post, z_mean, z_logvar, z_post
def decoder_frame(self,zf):
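        # Decode the concatenated [z_t, f] codes: through the DCGAN decoder for image inputs,
        # or back through the mirrored fully-connected stack for feature inputs.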
if self.input_type == 'image':
recon_x = self.decoder(zf)
return recon_x
if self.input_type == 'feature':
zf = self.z_2_out(zf) # batch,frames,(z_dim+f_dim) -> batch,frames,fc_output_dim
zf = self.relu(zf)
if self.add_fc > 2:
zf = self.dec_fc_layer3(zf)
if self.use_bn == 'shared':
zf = self.bn_dec_layer3(zf)
elif self.use_bn == 'separated':
                    zf_src = self.bn_S_dec_layer3(zf[:self.batch_size,:,:])
                    zf_tar = self.bn_T_dec_layer3(zf[self.batch_size:,:,:])
zf = torch.cat([zf_src,zf_tar],axis=0)
zf = self.relu(zf)
if self.add_fc > 1:
zf = self.dec_fc_layer2(zf)
if self.use_bn == 'shared':
zf = self.bn_dec_layer2(zf)
elif self.use_bn == 'separated':
                    zf_src = self.bn_S_dec_layer2(zf[:self.batch_size,:,:])
                    zf_tar = self.bn_T_dec_layer2(zf[self.batch_size:,:,:])
zf = torch.cat([zf_src,zf_tar],axis=0)
zf = self.relu(zf)
zf = self.dec_fc_layer1(zf)
if self.use_bn == 'shared':
                zf = self.bn_dec_layer1(zf)
            elif self.use_bn == 'separated':
                zf_src = self.bn_S_dec_layer1(zf[:self.batch_size,:,:])
                zf_tar = self.bn_T_dec_layer1(zf[self.batch_size:,:,:])
zf = torch.cat([zf_src,zf_tar],axis=0)
recon_x = self.relu(zf)
return recon_x
def encoder_frame(self, x):
if self.input_type == 'image':
# input x is list of length Frames [batchsize, channels, size, size]
# convert it to [batchsize, frames, channels, size, size]
# [batch_size, frames, channels, size, size] to [batch_size * frames, channels, size, size]
x_shape = x.shape
x = x.view(-1, x_shape[-3], x_shape[-2], x_shape[-1])
x_embed = self.encoder(x)[0]
# to [batch_size,frames,embed_dim]
return x_embed.view(x_shape[0], x_shape[1], -1)
if self.input_type == 'feature':
            # input is [batchsize, frames, input_dim]
x_embed = self.enc_fc_layer1(x)
## use batchnormalization or not (if yes whether the source and target share the same batchnormalization)
if self.use_bn == 'shared':
x_embed = self.bn_enc_layer1(x_embed)
elif self.use_bn == 'separated':
                x_embed_src = self.bn_S_enc_layer1(x_embed[:self.batch_size,:,:])
                x_embed_tar = self.bn_T_enc_layer1(x_embed[self.batch_size:,:,:])
x_embed = torch.cat([x_embed_src,x_embed_tar],axis=0)
x_embed = self.relu(x_embed)
if self.add_fc > 1:
x_embed = self.enc_fc_layer2(x_embed)
if self.use_bn == 'shared':
x_embed = self.bn_enc_layer2(x_embed)
elif self.use_bn == 'separated':
                    x_embed_src = self.bn_S_enc_layer2(x_embed[:self.batch_size,:,:])
                    x_embed_tar = self.bn_T_enc_layer2(x_embed[self.batch_size:,:,:])
x_embed = torch.cat([x_embed_src,x_embed_tar],axis=0)
x_embed = self.relu(x_embed)
if self.add_fc > 2:
x_embed = self.enc_fc_layer3(x_embed)
if self.use_bn == 'shared':
x_embed = self.bn_enc_layer3(x_embed)
elif self.use_bn == 'separated':
                    x_embed_src = self.bn_S_enc_layer3(x_embed[:self.batch_size,:,:])
                    x_embed_tar = self.bn_T_enc_layer3(x_embed[self.batch_size:,:,:])
x_embed = torch.cat([x_embed_src,x_embed_tar],axis=0)
x_embed = self.relu(x_embed)
## [batchsize, frame, output_dim]
return x_embed
def reparameterize(self, mean, logvar, random_sampling=True):
# Reparametrization occurs only if random sampling is set to true, otherwise mean is returned
if random_sampling is True:
eps = torch.randn_like(logvar)
std = torch.exp(0.5 * logvar)
z = mean + eps * std
return z
else:
return mean
def sample_z_prior_train(self, z_post, random_sampling=True):
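        # Prior rollout with teacher forcing: the two-layer LSTM prior is advanced for
        # self.frames steps, but the posterior z_t is fed back in at every step
        # (used when opt.prior_sample == 'post').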
z_out = None # This will ultimately store all z_s in the format [batch_size, frames, z_dim]
z_means = None
z_logvars = None
batch_size = z_post.shape[0]
z_t = torch.zeros(batch_size, self.z_dim).cpu()
h_t_ly1 = torch.zeros(batch_size, self.hidden_dim).cpu()
c_t_ly1 = torch.zeros(batch_size, self.hidden_dim).cpu()
h_t_ly2 = torch.zeros(batch_size, self.hidden_dim).cpu()
c_t_ly2 = torch.zeros(batch_size, self.hidden_dim).cpu()
for i in range(self.frames):
# two layer LSTM and two one-layer FC
h_t_ly1, c_t_ly1 = self.z_prior_lstm_ly1(z_t, (h_t_ly1, c_t_ly1))
h_t_ly2, c_t_ly2 = self.z_prior_lstm_ly2(h_t_ly1, (h_t_ly2, c_t_ly2))
z_mean_t = self.z_prior_mean(h_t_ly2)
z_logvar_t = self.z_prior_logvar(h_t_ly2)
z_prior = self.reparameterize(z_mean_t, z_logvar_t, random_sampling)
if z_out is None:
# If z_out is none it means z_t is z_1, hence store it in the format [batch_size, 1, z_dim]
z_out = z_prior.unsqueeze(1)
z_means = z_mean_t.unsqueeze(1)
z_logvars = z_logvar_t.unsqueeze(1)
else:
# If z_out is not none, z_t is not the initial z and hence append it to the previous z_ts collected in z_out
z_out = torch.cat((z_out, z_prior.unsqueeze(1)), dim=1)
z_means = torch.cat((z_means, z_mean_t.unsqueeze(1)), dim=1)
z_logvars = torch.cat((z_logvars, z_logvar_t.unsqueeze(1)), dim=1)
z_t = z_post[:,i,:]
return z_means, z_logvars, z_out
# If random sampling is true, reparametrization occurs else z_t is just set to the mean
def sample_z(self, batch_size, random_sampling=True):
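        # Free-running prior rollout from z_0 = 0: each step's z_t comes from the LSTM prior
        # itself (sampled, or the mean when random_sampling=False) and is fed back in
        # (used when opt.prior_sample == 'random').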
z_out = None # This will ultimately store all z_s in the format [batch_size, frames, z_dim]
z_means = None
z_logvars = None
# All states are initially set to 0, especially z_0 = 0
z_t = torch.zeros(batch_size, self.z_dim).cpu()
# z_mean_t = torch.zeros(batch_size, self.z_dim)
# z_logvar_t = torch.zeros(batch_size, self.z_dim)
h_t_ly1 = torch.zeros(batch_size, self.hidden_dim).cpu()
c_t_ly1 = torch.zeros(batch_size, self.hidden_dim).cpu()
h_t_ly2 = torch.zeros(batch_size, self.hidden_dim).cpu()
c_t_ly2 = torch.zeros(batch_size, self.hidden_dim).cpu()
for _ in range(self.frames):
# h_t, c_t = self.z_prior_lstm(z_t, (h_t, c_t))
# two layer LSTM and two one-layer FC
h_t_ly1, c_t_ly1 = self.z_prior_lstm_ly1(z_t, (h_t_ly1, c_t_ly1))
h_t_ly2, c_t_ly2 = self.z_prior_lstm_ly2(h_t_ly1, (h_t_ly2, c_t_ly2))
z_mean_t = self.z_prior_mean(h_t_ly2)
z_logvar_t = self.z_prior_logvar(h_t_ly2)
z_t = self.reparameterize(z_mean_t, z_logvar_t, random_sampling)
if z_out is None:
# If z_out is none it means z_t is z_1, hence store it in the format [batch_size, 1, z_dim]
z_out = z_t.unsqueeze(1)
z_means = z_mean_t.unsqueeze(1)
z_logvars = z_logvar_t.unsqueeze(1)
else:
# If z_out is not none, z_t is not the initial z and hence append it to the previous z_ts collected in z_out
z_out = torch.cat((z_out, z_t.unsqueeze(1)), dim=1)
z_means = torch.cat((z_means, z_mean_t.unsqueeze(1)), dim=1)
z_logvars = torch.cat((z_logvars, z_logvar_t.unsqueeze(1)), dim=1)
return z_means, z_logvars, z_out
def forward(self, x, beta):
# beta [beta_relation, beta_video, beta_frame]
f_mean, f_logvar, f_post, z_mean_post, z_logvar_post, z_post = self.encode_and_sample_post(x)
if self.prior_sample == 'random':
z_mean_prior, z_logvar_prior, z_prior = self.sample_z(z_post.size(0),random_sampling=False)
elif self.prior_sample == 'post':
z_mean_prior, z_logvar_prior, z_prior = self.sample_z_prior_train(z_post, random_sampling=False)
if isinstance(f_post, list):
f_expand = f_post[0].unsqueeze(1).expand(-1, self.frames, self.f_dim)
else:
f_expand = f_post.unsqueeze(1).expand(-1, self.frames, self.f_dim)
zf = torch.cat((z_post, f_expand), dim=2) # batch,frames,(z_dim+f_dim)
        ## reconstruct x
recon_x = self.decoder_frame(zf)
## For constraints on z_post [batch,frame,z_dim] and f_post [batch,f_dim]
        pred_domain_all = []  # collects domain predictions: (1) frame-level z_post, (2) each relation (if trn), (3) video-level feature, (4) f_post
#1. adversarial on z_post (frame level)
z_post_feat = z_post.view(-1, z_post.size()[-1]) # e.g. 32 x 5 x 2048 --> 160 x 2048
z_post_feat = self.dropout_f(z_post_feat)
pred_fc_domain_frame = self.domain_classifier_frame(z_post_feat, beta[2])
pred_fc_domain_frame = pred_fc_domain_frame.view((z_post.size(0), self.frames) + pred_fc_domain_frame.size()[-1:])
pred_domain_all.append(pred_fc_domain_frame)
#2 adversarial on z_post (video level, relation level if trn is used)
if self.frame_aggregation == 'rnn':
self.bilstm.flatten_parameters()
z_post_video_feat, _ = self.bilstm(z_post)
backward = z_post_video_feat[:, 0, self.z_dim:2 * self.z_dim]
frontal = z_post_video_feat[:, self.frames - 1, 0:self.z_dim]
z_post_video_feat = torch.cat((frontal, backward), dim=1)
pred_fc_domain_relation = []
pred_domain_all.append(pred_fc_domain_relation)
elif self.frame_aggregation == 'trn':
z_post_video_relation = self.TRN(z_post) ## [batch, frame-1, self.feat_aggregated_dim]
# adversarial branch for each relation
pred_fc_domain_relation = self.domain_classifier_relation(z_post_video_relation, beta[0])
pred_domain_all.append(pred_fc_domain_relation.view((z_post.size(0), z_post_video_relation.size()[1]) + pred_fc_domain_relation.size()[-1:]))
# transferable attention
if self.use_attn != 'none': # get the attention weighting
z_post_video_relation_attn, _ = self.get_attn_feat_relation(z_post_video_relation, pred_fc_domain_relation, self.frames)
# sum up relation features (ignore 1-relation)
z_post_video_feat = torch.sum(z_post_video_relation_attn, 1)
z_post_video_feat = self.dropout_v(z_post_video_feat)
pred_fc_domain_video = self.domain_classifier_video(z_post_video_feat, beta[1])
pred_fc_domain_video = pred_fc_domain_video.view((z_post.size(0),) + pred_fc_domain_video.size()[-1:])
pred_domain_all.append(pred_fc_domain_video)
#3. video prediction
pred_video_class = self.pred_classifier_video(z_post_video_feat)
#4. domain prediction on f
if isinstance(f_post, list):
pred_fc_domain_latent = self.domain_classifier_latent(f_post[0])
else:
pred_fc_domain_latent = self.domain_classifier_latent(f_post)
pred_domain_all.append(pred_fc_domain_latent)
return f_mean, f_logvar, f_post, z_mean_post, z_logvar_post, z_post, z_mean_prior, z_logvar_prior, z_prior, recon_x, pred_domain_all, pred_video_class
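# Illustrative forward call (a sketch; shapes assume the default Sprite config of 8 RGB frames
# at 64x64 and a model built as `model = TransferVAE_Video(opt)`):
#   x = torch.randn(2, 8, 3, 64, 64)
#   outs = model(x, beta=[1.0, 1.0, 1.0])      # beta = [beta_relation, beta_video, beta_frame]
#   recon_x, pred_domain_all, pred_video_class = outs[9], outs[10], outs[11]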
def name2seq(file_name):
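    # Stack the 8 PNG frames '<file_name>0.png' ... '<file_name>7.png' into a float tensor of
    # shape [1, 8, 3, H, W], scaled to roughly [0, 1].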
images = []
for frame in range(8):
frame_name = '%d' % (frame)
image_filename = file_name + frame_name + '.png'
image = imageio.imread(image_filename)
images.append(image[:, :, :3])
images = np.asarray(images, dtype='f') / 256.0
images = images.transpose((0, 3, 1, 2))
print(images.shape)
images = torch.Tensor(images).unsqueeze(dim=0)
return images
def display_gif(file_name, save_name):
images = []
for frame in range(8):
frame_name = '%d' % (frame)
image_filename = file_name + frame_name + '.png'
images.append(imageio.imread(image_filename))
    return imageio.mimsave(save_name, images)
def display_gif_pad(file_name, save_name):
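    # Read the 8 frames, pad each with 125 black pixels on the left and right (for a wider
    # display), and write the animated GIF to save_name.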
images = []
for frame in range(8):
frame_name = '%d' % (frame)
image_filename = file_name + frame_name + '.png'
image = imageio.imread(image_filename)
image = image[:, :, :3]
image_pad = cv2.copyMakeBorder(image, 0, 0, 125, 125, cv2.BORDER_CONSTANT, value=0)
images.append(image_pad)
return imageio.mimsave(save_name, images)
def display_image(file_name):
image_filename = file_name + '0' + '.png'
print(image_filename)
image = imageio.imread(image_filename)
imageio.imwrite('image.png', image)
def run(domain_source, action_source, hair_source, top_source, bottom_source, domain_target, action_target, hair_target, top_target, bottom_target):
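    # Map the colour names chosen in the UI to the Sprite attribute indices, build the frame
    # prefixes for the source and target avatars, write their padded GIFs, and load the
    # pretrained TransferVAE checkpoint.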
# == Source Avatar ==
# body
body_source = '0'
# hair
if hair_source == "green": hair_source = '0'
elif hair_source == "yellow": hair_source = '2'
elif hair_source == "rose": hair_source = '4'
elif hair_source == "red": hair_source = '7'
elif hair_source == "wine": hair_source = '8'
# top
if top_source == "brown": top_source = '0'
elif top_source == "blue": top_source = '1'
elif top_source == "white": top_source = '2'
# bottom
if bottom_source == "white": bottom_source = '0'
elif bottom_source == "golden": bottom_source = '1'
elif bottom_source == "red": bottom_source = '2'
elif bottom_source == "silver": bottom_source = '3'
file_name_source = './Sprite/frames/domain_1/' + action_source + '/'
file_name_source = file_name_source + 'front' + '_' + str(body_source) + str(bottom_source) + str(top_source) + str(hair_source) + '_'
gif = display_gif_pad(file_name_source, 'avatar_source.gif')
# == Target Avatar ==
# body
body_target = '1'
# hair
if hair_target == "violet": hair_target = '1'
elif hair_target == "silver": hair_target = '3'
elif hair_target == "purple": hair_target = '5'
elif hair_target == "grey": hair_target = '6'
elif hair_target == "golden": hair_target = '9'
# top
if top_target == "grey": top_target = '3'
elif top_target == "khaki": top_target = '4'
elif top_target == "linen": top_target = '5'
elif top_target == "ocre": top_target = '6'
# bottom
if bottom_target == "denim": bottom_target = '4'
elif bottom_target == "olive": bottom_target = '5'
elif bottom_target == "brown": bottom_target = '6'
file_name_target = './Sprite/frames/domain_2/' + action_target + '/'
file_name_target = file_name_target + 'front' + '_' + str(body_target) + str(bottom_target) + str(top_target) + str(hair_target) + '_'
gif_target = display_gif_pad(file_name_target, 'avatar_target.gif')
# == Load Model ==
model = TransferVAE_Video(opt)
model.load_state_dict(torch.load('TransferVAE.pth.tar', map_location=torch.device('cpu'))['state_dict'])
model.eval()
return 'demo.gif'
gr.Interface(
run,
inputs=[
gr.Textbox(value="Source Avatar - Human", interactive=False),
gr.Radio(choices=["slash", "spellcard", "walk"], value="slash"),
gr.Radio(choices=["green", "yellow", "rose", "red", "wine"], value="green"),
gr.Radio(choices=["brown", "blue", "white"], value="brown"),
gr.Radio(choices=["white", "golden", "red", "silver"], value="white"),
gr.Textbox(value="Target Avatar - Alien", interactive=False),
gr.Radio(choices=["slash", "spellcard", "walk"], value="walk"),
gr.Radio(choices=["violet", "silver", "purple", "grey", "golden"], value="golden"),
gr.Radio(choices=["grey", "khaki", "linen", "ocre"], value="ocre"),
gr.Radio(choices=["denim", "olive", "brown"], value="brown"),
],
outputs=[
gr.components.Image(type="file", label="Domain Disentanglement"),
],
live=True,
title="TransferVAE for Unsupervised Video Domain Adaptation",
).launch()