diff --git "a/KAIR/experiments/003_train_vrt_videosr_bi_vimeo_7frames/train.log" "b/KAIR/experiments/003_train_vrt_videosr_bi_vimeo_7frames/train.log" new file mode 100644--- /dev/null +++ "b/KAIR/experiments/003_train_vrt_videosr_bi_vimeo_7frames/train.log" @@ -0,0 +1,10958 @@ +22-03-11 09:56:26.486 : task: 003_train_vrt_videosr_bi_vimeo_7frames + model: vrt + gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7] + dist: False + find_unused_parameters: False + use_static_graph: True + scale: 4 + n_channels: 3 + path:[ + root: experiments + pretrained_netG: model_zoo/vrt/002_VRT_videosr_bi_REDS_16frames.pth + pretrained_netE: None + task: experiments/003_train_vrt_videosr_bi_vimeo_7frames + log: experiments/003_train_vrt_videosr_bi_vimeo_7frames + options: experiments/003_train_vrt_videosr_bi_vimeo_7frames/options + models: experiments/003_train_vrt_videosr_bi_vimeo_7frames/models + images: experiments/003_train_vrt_videosr_bi_vimeo_7frames/images + pretrained_optimizerG: None + ] + datasets:[ + train:[ + name: train_dataset + dataset_type: VideoRecurrentTrainVimeoDataset + dataroot_gt: trainsets/vimeo90k + dataroot_lq: trainsets/vimeo90k + meta_info_file: data/meta_info/meta_info_Vimeo90K_train_GT.txt + io_backend:[ + type: file + ] + num_frame: -1 + gt_size: 256 + interval_list: [1] + random_reverse: True + use_hflip: True + use_rot: True + pad_sequence: True + dataloader_shuffle: True + dataloader_num_workers: 32 + dataloader_batch_size: 8 + phase: train + scale: 4 + n_channels: 3 + ] + test:[ + name: test_dataset + dataset_type: VideoRecurrentTestDataset + dataroot_gt: testsets/Vid4/GT + dataroot_lq: testsets/Vid4/BIx4 + cache_data: True + io_backend:[ + type: disk + ] + num_frame: -1 + phase: test + scale: 4 + n_channels: 3 + ] + ] + netG:[ + net_type: vrt + upscale: 4 + img_size: [8, 64, 64] + window_size: [8, 8, 8] + depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4] + indep_reconsts: [11, 12] + embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180] + num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6] + spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth + pa_frames: 4 + deformable_groups: 16 + nonblind_denoising: False + use_checkpoint_attn: False + use_checkpoint_ffn: False + no_checkpoint_attn_blocks: [] + no_checkpoint_ffn_blocks: [] + init_type: default + scale: 4 + ] + train:[ + G_lossfn_type: charbonnier + G_lossfn_weight: 1.0 + G_charbonnier_eps: 1e-09 + E_decay: 0 + G_optimizer_type: adam + G_optimizer_lr: 0.0004 + G_optimizer_betas: [0.9, 0.99] + G_optimizer_wd: 0 + G_optimizer_clipgrad: None + G_optimizer_reuse: True + fix_iter: 20000 + fix_lr_mul: 0.125 + fix_keys: ['spynet', 'deform'] + total_iter: 300000 + G_scheduler_type: CosineAnnealingWarmRestarts + G_scheduler_periods: 300000 + G_scheduler_eta_min: 1e-07 + G_regularizer_orthstep: None + G_regularizer_clipstep: None + G_param_strict: False + E_param_strict: True + checkpoint_test: 5000 + checkpoint_save: 5000 + checkpoint_print: 200 + F_feature_layer: 34 + F_weights: 1.0 + F_lossfn_type: l1 + F_use_input_norm: True + F_use_range_norm: False + G_scheduler_restart_weights: 1 + ] + val:[ + save_img: False + pad_seq: False + flip_seq: False + center_frame_only: False + num_frame_testing: 32 + num_frame_overlapping: 2 + size_patch_testing: 128 + ] + opt_path: options/vrt/003_train_vrt_videosr_bi_vimeo_7frames.json + is_train: True + merge_bn: False + merge_bn_startpoint: -1 + num_gpu: 8 + rank: 0 + world_size: 1 + +22-03-11 09:56:26.522 : Number of train images: 64,612, iters: 8,077 +22-03-11 10:10:27.405 : 
+22-03-11 10:10:31.005 : 
+Networks name: VRT
+Params number: 32577991
+Net structure:
+VRT(
+  (conv_first): Conv3d(27, 120, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1))
+  (spynet): SpyNet(
+    (basic_module): ModuleList(
+      (0): BasicModule(
+        (basic_module): Sequential(
+          (0): Conv2d(8, 32, kernel_size=(7, 7),
stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (1): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (2): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (3): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (4): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (5): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + ) + ) + (stage1): Stage( + (reshape): Sequential( + (0): Rearrange('n c d h w -> n d h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): 
Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() 
+ (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(364, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 432, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage2): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): 
Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, 
elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(364, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 432, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage3): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, 
elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, 
elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(364, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 432, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage4): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): 
TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + 
(norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(364, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 432, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage5): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): 
Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), 
eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(364, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 432, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage6): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, 
out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + 
) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(364, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 432, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage7): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + 
(qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(364, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), 
stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 432, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage8): ModuleList( + (0): Sequential( + (0): Rearrange('n c d h w -> n d h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=120, out_features=180, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (1): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (2): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + 
(0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (3): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): 
Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (4): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) 
+ (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (5): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (6): RTMSA( + (residual_group): TMSAG( + (blocks): 
ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + ) + (norm): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (conv_after_body): Linear(in_features=180, out_features=120, bias=True) + (conv_before_upsample): Sequential( + (0): Conv3d(120, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): LeakyReLU(negative_slope=0.01, inplace=True) + ) + (upsample): Upsample( + (0): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): Transpose_Dim12() + (2): PixelShuffle(upscale_factor=2) + (3): Transpose_Dim12() + (4): LeakyReLU(negative_slope=0.1, inplace=True) + (5): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (6): Transpose_Dim12() + (7): PixelShuffle(upscale_factor=2) + (8): Transpose_Dim12() + (9): LeakyReLU(negative_slope=0.1, inplace=True) + (10): Conv3d(64, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + ) + (conv_last): Conv3d(64, 3, kernel_size=(1, 3, 3), stride=(1, 1, 
1), padding=(0, 1, 1)) +) + +22-03-11 10:10:31.165 : + | mean | min | max | std || shape + | 0.000 | -1.496 | 1.623 | 0.115 | torch.Size([120, 27, 1, 3, 3]) || conv_first.weight + | -0.005 | -1.075 | 0.916 | 0.274 | torch.Size([120]) || conv_first.bias + | 0.449 | 0.406 | 0.485 | 0.040 | torch.Size([1, 3, 1, 1]) || spynet.mean + | 0.226 | 0.224 | 0.229 | 0.003 | torch.Size([1, 3, 1, 1]) || spynet.std + | -0.000 | -0.656 | 0.699 | 0.067 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.0.basic_module.0.weight + | -0.037 | -0.877 | 0.359 | 0.346 | torch.Size([32]) || spynet.basic_module.0.basic_module.0.bias + | -0.007 | -3.201 | 0.948 | 0.097 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.0.basic_module.2.weight + | 0.063 | -1.264 | 0.752 | 0.323 | torch.Size([64]) || spynet.basic_module.0.basic_module.2.bias + | -0.010 | -4.633 | 0.568 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.0.basic_module.4.weight + | 0.158 | -0.704 | 0.861 | 0.357 | torch.Size([32]) || spynet.basic_module.0.basic_module.4.bias + | -0.024 | -1.714 | 0.414 | 0.091 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.0.basic_module.6.weight + | 0.779 | -1.061 | 1.164 | 0.519 | torch.Size([16]) || spynet.basic_module.0.basic_module.6.bias + | 0.000 | -0.148 | 0.161 | 0.018 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.0.basic_module.8.weight + | 0.002 | -0.000 | 0.004 | 0.003 | torch.Size([2]) || spynet.basic_module.0.basic_module.8.bias + | 0.000 | -0.745 | 0.760 | 0.070 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.1.basic_module.0.weight + | -0.019 | -0.848 | 0.359 | 0.331 | torch.Size([32]) || spynet.basic_module.1.basic_module.0.bias + | -0.010 | -3.373 | 0.916 | 0.099 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.1.basic_module.2.weight + | 0.037 | -1.227 | 0.720 | 0.303 | torch.Size([64]) || spynet.basic_module.1.basic_module.2.bias + | -0.009 | -4.425 | 0.539 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.1.basic_module.4.weight + | 0.158 | -0.758 | 0.988 | 0.386 | torch.Size([32]) || spynet.basic_module.1.basic_module.4.bias + | -0.020 | -1.647 | 0.319 | 0.084 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.1.basic_module.6.weight + | 0.777 | -1.211 | 1.152 | 0.550 | torch.Size([16]) || spynet.basic_module.1.basic_module.6.bias + | 0.000 | -0.126 | 0.144 | 0.017 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.1.basic_module.8.weight + | 0.004 | 0.001 | 0.008 | 0.005 | torch.Size([2]) || spynet.basic_module.1.basic_module.8.bias + | 0.000 | -0.938 | 0.872 | 0.088 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.2.basic_module.0.weight + | -0.028 | -1.086 | 0.552 | 0.435 | torch.Size([32]) || spynet.basic_module.2.basic_module.0.bias + | -0.011 | -4.624 | 1.203 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.2.basic_module.2.weight + | 0.022 | -1.298 | 0.715 | 0.312 | torch.Size([64]) || spynet.basic_module.2.basic_module.2.bias + | -0.010 | -1.806 | 0.627 | 0.092 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.2.basic_module.4.weight + | 0.118 | -0.698 | 0.750 | 0.332 | torch.Size([32]) || spynet.basic_module.2.basic_module.4.bias + | -0.014 | -1.277 | 0.337 | 0.067 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.2.basic_module.6.weight + | 0.684 | -1.730 | 0.954 | 0.648 | torch.Size([16]) || spynet.basic_module.2.basic_module.6.bias + | 0.000 | -0.031 | 0.042 | 0.009 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.2.basic_module.8.weight + | -0.010 | -0.010 | -0.010 | 0.000 | torch.Size([2]) || 
spynet.basic_module.2.basic_module.8.bias + | -0.000 | -0.956 | 0.847 | 0.089 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.3.basic_module.0.weight + | -0.049 | -1.175 | 0.652 | 0.477 | torch.Size([32]) || spynet.basic_module.3.basic_module.0.bias + | -0.010 | -4.892 | 1.180 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.3.basic_module.2.weight + | 0.021 | -1.294 | 0.764 | 0.316 | torch.Size([64]) || spynet.basic_module.3.basic_module.2.bias + | -0.010 | -1.793 | 0.556 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.3.basic_module.4.weight + | 0.123 | -0.717 | 0.737 | 0.335 | torch.Size([32]) || spynet.basic_module.3.basic_module.4.bias + | -0.012 | -1.102 | 0.291 | 0.061 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.3.basic_module.6.weight + | 0.650 | -1.838 | 0.913 | 0.669 | torch.Size([16]) || spynet.basic_module.3.basic_module.6.bias + | 0.000 | -0.032 | 0.039 | 0.006 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.3.basic_module.8.weight + | 0.000 | -0.012 | 0.012 | 0.017 | torch.Size([2]) || spynet.basic_module.3.basic_module.8.bias + | -0.000 | -0.953 | 0.855 | 0.089 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.4.basic_module.0.weight + | -0.009 | -1.001 | 0.584 | 0.427 | torch.Size([32]) || spynet.basic_module.4.basic_module.0.bias + | -0.010 | -5.054 | 1.223 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.4.basic_module.2.weight + | 0.023 | -1.315 | 0.884 | 0.326 | torch.Size([64]) || spynet.basic_module.4.basic_module.2.bias + | -0.009 | -1.786 | 0.534 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.4.basic_module.4.weight + | 0.142 | -0.698 | 0.780 | 0.342 | torch.Size([32]) || spynet.basic_module.4.basic_module.4.bias + | -0.011 | -0.957 | 0.276 | 0.057 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.4.basic_module.6.weight + | 0.653 | -1.854 | 0.943 | 0.677 | torch.Size([16]) || spynet.basic_module.4.basic_module.6.bias + | 0.000 | -0.034 | 0.035 | 0.005 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.4.basic_module.8.weight + | -0.001 | -0.010 | 0.008 | 0.012 | torch.Size([2]) || spynet.basic_module.4.basic_module.8.bias + | -0.000 | -0.918 | 0.865 | 0.087 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.5.basic_module.0.weight + | 0.047 | -0.824 | 0.510 | 0.392 | torch.Size([32]) || spynet.basic_module.5.basic_module.0.bias + | -0.009 | -5.094 | 1.213 | 0.118 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.5.basic_module.2.weight + | 0.029 | -1.319 | 0.938 | 0.330 | torch.Size([64]) || spynet.basic_module.5.basic_module.2.bias + | -0.007 | -1.794 | 0.519 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.5.basic_module.4.weight + | 0.145 | -0.725 | 0.830 | 0.349 | torch.Size([32]) || spynet.basic_module.5.basic_module.4.bias + | -0.008 | -0.766 | 0.275 | 0.052 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.5.basic_module.6.weight + | 0.659 | -1.945 | 0.999 | 0.706 | torch.Size([16]) || spynet.basic_module.5.basic_module.6.bias + | 0.000 | -0.025 | 0.026 | 0.002 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.5.basic_module.8.weight + | 0.014 | 0.001 | 0.027 | 0.018 | torch.Size([2]) || spynet.basic_module.5.basic_module.8.bias + | 1.335 | 0.614 | 2.324 | 0.313 | torch.Size([120]) || stage1.reshape.1.weight + | -0.007 | -0.451 | 0.392 | 0.149 | torch.Size([120]) || stage1.reshape.1.bias + | 0.640 | 0.164 | 1.487 | 0.258 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.weight + | -0.072 | -1.225 | 0.558 | 0.260 | torch.Size([120]) || 
stage1.residual_group1.blocks.0.norm1.bias + | -0.295 | -4.200 | 2.891 | 0.402 | torch.Size([675, 6]) || stage1.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.0.attn.position_bias + | 0.001 | -0.736 | 0.771 | 0.143 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_self.weight + | -0.002 | -0.412 | 0.503 | 0.106 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_self.bias + | 0.001 | -0.711 | 0.595 | 0.091 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.attn.proj.weight + | -0.006 | -0.195 | 0.530 | 0.097 | torch.Size([120]) || stage1.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -1.076 | 1.181 | 0.133 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.000 | -0.228 | 0.294 | 0.059 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.836 | 0.408 | 1.248 | 0.162 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.weight + | 0.042 | -0.494 | 0.495 | 0.159 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.bias + | 0.003 | -0.889 | 0.982 | 0.142 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc11.weight + | 0.041 | -0.364 | 0.458 | 0.117 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.757 | 0.882 | 0.140 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc12.weight + | 0.011 | -0.400 | 0.470 | 0.157 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.852 | 1.093 | 0.139 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.mlp.fc2.weight + | 0.022 | -0.265 | 0.384 | 0.096 | torch.Size([120]) || stage1.residual_group1.blocks.0.mlp.fc2.bias + | 0.894 | 0.195 | 1.588 | 0.211 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.weight + | -0.156 | -1.734 | 0.260 | 0.208 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.bias + | -0.433 | -4.335 | 2.455 | 0.555 | torch.Size([675, 6]) || stage1.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.1.attn.position_bias + | -0.001 | -1.631 | 1.615 | 0.174 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_self.weight + | 0.005 | -0.246 | 0.392 | 0.072 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.697 | 0.574 | 0.098 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.attn.proj.weight + | 0.011 | -0.191 | 0.529 | 0.104 | torch.Size([120]) || stage1.residual_group1.blocks.1.attn.proj.bias + | -0.001 | -1.260 | 1.186 | 0.133 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.002 | -0.207 | 0.162 | 0.050 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.725 | 0.421 | 0.899 | 0.072 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.weight + | 0.043 | -0.750 | 0.403 | 0.161 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.bias + | -0.001 | -0.950 | 0.899 | 0.146 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc11.weight + | -0.001 | -0.381 | 0.301 | 0.092 | torch.Size([240]) || 
stage1.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.615 | 0.630 | 0.142 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc12.weight + | 0.009 | -0.473 | 0.647 | 0.131 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc12.bias + | 0.001 | -0.789 | 0.813 | 0.146 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.mlp.fc2.weight + | -0.041 | -0.335 | 0.331 | 0.119 | torch.Size([120]) || stage1.residual_group1.blocks.1.mlp.fc2.bias + | 1.087 | 0.163 | 1.663 | 0.218 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.weight + | -0.188 | -1.539 | 0.134 | 0.175 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.bias + | -0.505 | -4.230 | 3.070 | 0.545 | torch.Size([675, 6]) || stage1.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.2.attn.position_bias + | -0.000 | -1.348 | 1.453 | 0.171 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_self.weight + | 0.007 | -0.394 | 0.633 | 0.080 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_self.bias + | 0.001 | -0.561 | 0.466 | 0.108 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.attn.proj.weight + | 0.028 | -0.263 | 0.277 | 0.111 | torch.Size([120]) || stage1.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.982 | 1.268 | 0.124 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.001 | -0.139 | 0.149 | 0.035 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.743 | 0.234 | 0.925 | 0.092 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.weight + | 0.030 | -1.015 | 0.440 | 0.156 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.bias + | -0.002 | -0.956 | 1.234 | 0.155 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc11.weight + | 0.003 | -0.419 | 0.302 | 0.108 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.723 | 0.609 | 0.143 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc12.weight + | -0.007 | -0.362 | 0.529 | 0.129 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.768 | 0.645 | 0.147 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.mlp.fc2.weight + | -0.033 | -0.281 | 0.244 | 0.100 | torch.Size([120]) || stage1.residual_group1.blocks.2.mlp.fc2.bias + | 1.076 | 0.178 | 1.503 | 0.199 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.weight + | -0.153 | -1.699 | 0.096 | 0.171 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.bias + | -0.815 | -4.386 | 4.546 | 0.797 | torch.Size([675, 6]) || stage1.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.3.attn.position_bias + | 0.001 | -2.332 | 2.215 | 0.164 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_self.weight + | -0.004 | -0.455 | 0.400 | 0.070 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.504 | 0.556 | 0.108 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.attn.proj.weight + | -0.006 | -0.339 | 0.365 | 0.137 | torch.Size([120]) || 
stage1.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -1.444 | 1.191 | 0.122 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.001 | -0.162 | 0.140 | 0.029 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.715 | 0.229 | 0.865 | 0.078 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.weight + | 0.026 | -1.011 | 0.287 | 0.151 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.bias + | -0.003 | -0.761 | 0.828 | 0.148 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc11.weight + | 0.014 | -0.337 | 0.418 | 0.135 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.716 | 0.712 | 0.149 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc12.weight + | 0.003 | -0.427 | 0.369 | 0.124 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc12.bias + | 0.001 | -0.719 | 0.640 | 0.151 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.mlp.fc2.weight + | -0.010 | -0.557 | 0.227 | 0.103 | torch.Size([120]) || stage1.residual_group1.blocks.3.mlp.fc2.bias + | 1.161 | 0.188 | 1.556 | 0.179 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.weight + | -0.165 | -1.773 | 0.054 | 0.186 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.bias + | -0.575 | -3.741 | 5.261 | 0.767 | torch.Size([675, 6]) || stage1.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.4.attn.position_bias + | 0.000 | -2.020 | 2.251 | 0.173 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_self.weight + | 0.000 | -0.318 | 0.312 | 0.071 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.463 | 0.456 | 0.112 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.attn.proj.weight + | 0.002 | -0.406 | 0.393 | 0.154 | torch.Size([120]) || stage1.residual_group1.blocks.4.attn.proj.bias + | -0.001 | -0.968 | 1.330 | 0.123 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.001 | -0.152 | 0.176 | 0.030 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.699 | 0.230 | 0.850 | 0.073 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.weight + | 0.029 | -1.033 | 0.300 | 0.149 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.bias + | -0.002 | -0.718 | 0.803 | 0.145 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc11.weight + | 0.002 | -0.389 | 0.405 | 0.139 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc11.bias + | -0.001 | -0.582 | 0.624 | 0.151 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc12.weight + | 0.003 | -0.385 | 0.386 | 0.118 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.677 | 0.737 | 0.153 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.mlp.fc2.weight + | 0.003 | -0.671 | 0.208 | 0.108 | torch.Size([120]) || stage1.residual_group1.blocks.4.mlp.fc2.bias + | 1.067 | 0.173 | 1.473 | 0.179 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.weight + | -0.129 | -1.487 | 0.138 | 0.166 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.bias + | -0.530 | -3.629 | 3.705 | 0.621 | torch.Size([675, 6]) || stage1.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 
| 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.5.attn.position_bias + | 0.000 | -2.344 | 1.768 | 0.157 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_self.weight + | -0.001 | -0.428 | 0.265 | 0.082 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_self.bias + | -0.001 | -0.541 | 0.559 | 0.120 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.attn.proj.weight + | 0.031 | -0.324 | 0.379 | 0.133 | torch.Size([120]) || stage1.residual_group1.blocks.5.attn.proj.bias + | -0.001 | -1.380 | 0.992 | 0.120 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.000 | -0.100 | 0.111 | 0.027 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.637 | 0.273 | 0.780 | 0.064 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.weight + | 0.022 | -1.160 | 0.338 | 0.149 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.bias + | -0.002 | -0.696 | 0.638 | 0.139 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc11.weight + | 0.007 | -0.366 | 0.364 | 0.134 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc11.bias + | -0.001 | -0.581 | 0.657 | 0.151 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc12.weight + | -0.004 | -0.366 | 0.244 | 0.105 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -1.143 | 0.787 | 0.154 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.mlp.fc2.weight + | 0.023 | -1.254 | 0.407 | 0.160 | torch.Size([120]) || stage1.residual_group1.blocks.5.mlp.fc2.bias + | 0.001 | -0.293 | 0.270 | 0.065 | torch.Size([120, 120]) || stage1.linear1.weight + | 0.006 | -0.209 | 0.382 | 0.093 | torch.Size([120]) || stage1.linear1.bias + | 0.811 | 0.432 | 1.092 | 0.108 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.weight + | 0.033 | -0.763 | 0.477 | 0.200 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.bias + | -0.049 | -2.996 | 1.734 | 0.246 | torch.Size([3375, 6]) || stage1.residual_group2.blocks.0.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage1.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.847 | 1.215 | 0.150 | torch.Size([360, 120]) || stage1.residual_group2.blocks.0.attn.qkv_self.weight + | -0.000 | -0.542 | 0.581 | 0.147 | torch.Size([360]) || stage1.residual_group2.blocks.0.attn.qkv_self.bias + | 0.001 | -0.536 | 0.569 | 0.124 | torch.Size([120, 120]) || stage1.residual_group2.blocks.0.attn.proj.weight + | -0.004 | -0.195 | 0.602 | 0.102 | torch.Size([120]) || stage1.residual_group2.blocks.0.attn.proj.bias + | 0.568 | 0.438 | 0.872 | 0.074 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.weight + | 0.025 | -0.782 | 0.342 | 0.164 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.bias + | 0.003 | -0.601 | 0.699 | 0.126 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc11.weight + | 0.068 | -0.329 | 0.446 | 0.095 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc11.bias + | 0.001 | -0.807 | 0.710 | 0.143 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc12.weight + | -0.002 | -0.585 | 0.392 | 0.117 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.779 | 0.575 | 0.142 | torch.Size([120, 240]) || 
stage1.residual_group2.blocks.0.mlp.fc2.weight + | 0.008 | -0.377 | 0.374 | 0.159 | torch.Size([120]) || stage1.residual_group2.blocks.0.mlp.fc2.bias + | 0.942 | 0.411 | 1.171 | 0.093 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.weight + | 0.038 | -0.837 | 0.321 | 0.152 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.bias + | -0.077 | -2.150 | 2.175 | 0.237 | torch.Size([3375, 6]) || stage1.residual_group2.blocks.1.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage1.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.750 | 0.771 | 0.159 | torch.Size([360, 120]) || stage1.residual_group2.blocks.1.attn.qkv_self.weight + | -0.004 | -0.589 | 0.559 | 0.145 | torch.Size([360]) || stage1.residual_group2.blocks.1.attn.qkv_self.bias + | -0.000 | -0.478 | 0.525 | 0.125 | torch.Size([120, 120]) || stage1.residual_group2.blocks.1.attn.proj.weight + | 0.009 | -0.338 | 0.449 | 0.154 | torch.Size([120]) || stage1.residual_group2.blocks.1.attn.proj.bias + | 0.597 | 0.429 | 0.741 | 0.044 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.weight + | 0.038 | -0.697 | 0.195 | 0.103 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.bias + | 0.003 | -0.671 | 0.636 | 0.135 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc11.weight + | 0.057 | -0.519 | 0.422 | 0.139 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.629 | 0.607 | 0.153 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc12.weight + | -0.007 | -0.279 | 0.403 | 0.083 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc12.bias + | 0.001 | -0.620 | 0.712 | 0.150 | torch.Size([120, 240]) || stage1.residual_group2.blocks.1.mlp.fc2.weight + | 0.014 | -0.721 | 0.333 | 0.163 | torch.Size([120]) || stage1.residual_group2.blocks.1.mlp.fc2.bias + | 0.000 | -0.504 | 0.343 | 0.079 | torch.Size([120, 120]) || stage1.linear2.weight + | 0.015 | -0.276 | 0.353 | 0.122 | torch.Size([120]) || stage1.linear2.bias + | -0.000 | -0.151 | 0.136 | 0.025 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.weight + | -0.001 | -0.087 | 0.103 | 0.030 | torch.Size([120]) || stage1.pa_deform.bias + | -0.000 | -0.017 | 0.017 | 0.010 | torch.Size([120, 364, 3, 3]) || stage1.pa_deform.conv_offset.0.weight + | -0.004 | -0.024 | 0.040 | 0.013 | torch.Size([120]) || stage1.pa_deform.conv_offset.0.bias + | -0.001 | -0.122 | 0.123 | 0.017 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.2.weight + | -0.009 | -0.068 | 0.068 | 0.028 | torch.Size([120]) || stage1.pa_deform.conv_offset.2.bias + | -0.001 | -0.175 | 0.114 | 0.015 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.4.weight + | 0.019 | -0.059 | 0.110 | 0.042 | torch.Size([120]) || stage1.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432, 120, 3, 3]) || stage1.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432]) || stage1.pa_deform.conv_offset.6.bias + | -0.001 | -1.034 | 1.208 | 0.150 | torch.Size([360, 360]) || stage1.pa_fuse.fc11.weight + | 0.085 | -0.220 | 0.682 | 0.164 | torch.Size([360]) || stage1.pa_fuse.fc11.bias + | 0.001 | -1.305 | 1.408 | 0.167 | torch.Size([360, 360]) || stage1.pa_fuse.fc12.weight + | 0.005 | -0.474 | 0.521 | 0.147 | torch.Size([360]) || stage1.pa_fuse.fc12.bias + | 0.000 | -0.941 | 0.939 | 0.158 | torch.Size([120, 360]) || stage1.pa_fuse.fc2.weight + | 0.019 | -0.993 | 0.852 | 0.371 | torch.Size([120]) || 
stage1.pa_fuse.fc2.bias + | 1.099 | 0.165 | 1.669 | 0.285 | torch.Size([480]) || stage2.reshape.1.weight + | -0.009 | -0.723 | 0.825 | 0.237 | torch.Size([480]) || stage2.reshape.1.bias + | -0.000 | -0.767 | 0.672 | 0.163 | torch.Size([120, 480]) || stage2.reshape.2.weight + | -0.007 | -0.473 | 0.285 | 0.116 | torch.Size([120]) || stage2.reshape.2.bias + | 0.665 | 0.267 | 1.019 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.weight + | -0.152 | -0.897 | 0.303 | 0.218 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.bias + | -0.208 | -1.940 | 4.459 | 0.383 | torch.Size([675, 6]) || stage2.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.653 | 0.613 | 0.127 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_self.weight + | 0.003 | -0.263 | 0.270 | 0.066 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_self.bias + | 0.002 | -0.796 | 0.596 | 0.108 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.attn.proj.weight + | -0.008 | -0.955 | 0.285 | 0.127 | torch.Size([120]) || stage2.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -1.099 | 0.979 | 0.109 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.131 | 0.090 | 0.022 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.548 | 0.301 | 0.671 | 0.063 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.weight + | 0.003 | -0.744 | 0.803 | 0.231 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.bias + | 0.001 | -0.645 | 0.555 | 0.133 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc11.weight + | 0.013 | -0.406 | 0.272 | 0.097 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.622 | 0.666 | 0.147 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc12.weight + | 0.002 | -0.228 | 0.307 | 0.085 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc12.bias + | 0.001 | -0.834 | 0.822 | 0.149 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.mlp.fc2.weight + | -0.009 | -0.948 | 0.446 | 0.159 | torch.Size([120]) || stage2.residual_group1.blocks.0.mlp.fc2.bias + | 0.777 | 0.311 | 1.104 | 0.161 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.weight + | -0.178 | -0.966 | 0.822 | 0.247 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.bias + | -0.387 | -2.000 | 5.826 | 0.443 | torch.Size([675, 6]) || stage2.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.662 | 0.706 | 0.132 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_self.weight + | -0.006 | -0.348 | 0.306 | 0.079 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_self.bias + | -0.001 | -0.595 | 0.730 | 0.112 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.attn.proj.weight + | -0.001 | -0.811 | 0.531 | 0.167 | torch.Size([120]) || stage2.residual_group1.blocks.1.attn.proj.bias + | -0.000 | -1.007 | 1.002 | 0.105 | torch.Size([360, 120]) || 
stage2.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.002 | -0.180 | 0.108 | 0.024 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.599 | 0.282 | 0.730 | 0.059 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.weight + | -0.004 | -0.671 | 0.938 | 0.218 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.536 | 0.570 | 0.134 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc11.weight + | -0.022 | -0.540 | 0.226 | 0.107 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.646 | 0.589 | 0.149 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc12.weight + | 0.008 | -0.203 | 0.282 | 0.092 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -1.052 | 0.649 | 0.150 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.mlp.fc2.weight + | -0.007 | -0.581 | 0.467 | 0.137 | torch.Size([120]) || stage2.residual_group1.blocks.1.mlp.fc2.bias + | 0.780 | 0.134 | 1.161 | 0.193 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.weight + | -0.152 | -0.996 | 1.042 | 0.227 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.bias + | -0.186 | -2.565 | 4.152 | 0.428 | torch.Size([675, 6]) || stage2.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.2.attn.position_bias + | 0.001 | -0.856 | 0.814 | 0.151 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_self.weight + | -0.002 | -0.367 | 0.317 | 0.074 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_self.bias + | -0.001 | -0.656 | 0.730 | 0.131 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.attn.proj.weight + | -0.003 | -0.555 | 0.620 | 0.163 | torch.Size([120]) || stage2.residual_group1.blocks.2.attn.proj.bias + | 0.001 | -2.191 | 2.575 | 0.137 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.000 | -0.121 | 0.139 | 0.023 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.640 | 0.297 | 0.797 | 0.064 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.weight + | -0.013 | -0.584 | 0.934 | 0.217 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.523 | 0.556 | 0.136 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc11.weight + | -0.035 | -0.490 | 0.217 | 0.117 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.679 | 0.601 | 0.152 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc12.weight + | 0.005 | -0.287 | 0.308 | 0.098 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.576 | 0.584 | 0.151 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.mlp.fc2.weight + | -0.006 | -0.423 | 0.376 | 0.121 | torch.Size([120]) || stage2.residual_group1.blocks.2.mlp.fc2.bias + | 0.776 | 0.134 | 1.030 | 0.164 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.weight + | -0.167 | -0.870 | 1.066 | 0.204 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.bias + | -0.259 | -1.735 | 5.189 | 0.366 | torch.Size([675, 6]) || stage2.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || 
stage2.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.3.attn.position_bias + | 0.000 | -1.292 | 1.255 | 0.149 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_self.weight + | 0.000 | -0.493 | 0.445 | 0.101 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_self.bias + | 0.001 | -0.618 | 0.582 | 0.122 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.attn.proj.weight + | -0.001 | -0.543 | 0.420 | 0.166 | torch.Size([120]) || stage2.residual_group1.blocks.3.attn.proj.bias + | 0.002 | -2.296 | 2.630 | 0.162 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.001 | -0.130 | 0.149 | 0.028 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.625 | 0.301 | 0.772 | 0.060 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.weight + | -0.015 | -0.498 | 0.992 | 0.198 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.bias + | -0.000 | -0.620 | 0.681 | 0.130 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc11.weight + | -0.006 | -0.391 | 0.256 | 0.113 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.575 | 0.669 | 0.152 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc12.weight + | -0.000 | -0.225 | 0.333 | 0.088 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc12.bias + | 0.001 | -0.680 | 0.639 | 0.151 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.mlp.fc2.weight + | -0.011 | -0.549 | 0.259 | 0.139 | torch.Size([120]) || stage2.residual_group1.blocks.3.mlp.fc2.bias + | 0.933 | 0.310 | 1.186 | 0.121 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.weight + | -0.180 | -0.736 | 1.168 | 0.204 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.bias + | -0.164 | -2.965 | 4.145 | 0.437 | torch.Size([675, 6]) || stage2.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.4.attn.position_bias + | 0.000 | -0.860 | 0.749 | 0.136 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_self.weight + | 0.005 | -0.274 | 0.308 | 0.080 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_self.bias + | 0.001 | -0.648 | 0.681 | 0.129 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.attn.proj.weight + | 0.002 | -0.547 | 0.295 | 0.149 | torch.Size([120]) || stage2.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.647 | 0.577 | 0.105 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.001 | -0.138 | 0.125 | 0.023 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.635 | 0.329 | 0.748 | 0.049 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.weight + | -0.018 | -0.375 | 0.891 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.603 | 0.497 | 0.130 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc11.weight + | -0.010 | -0.345 | 0.297 | 0.113 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.680 | 0.679 | 0.153 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc12.weight + | -0.000 | -0.200 | 0.251 | 0.086 | torch.Size([240]) || 
stage2.residual_group1.blocks.4.mlp.fc12.bias + | -0.001 | -0.568 | 0.614 | 0.152 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.mlp.fc2.weight + | -0.009 | -0.375 | 0.493 | 0.135 | torch.Size([120]) || stage2.residual_group1.blocks.4.mlp.fc2.bias + | 0.870 | 0.315 | 1.059 | 0.096 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.weight + | -0.139 | -0.657 | 1.107 | 0.163 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.bias + | -0.156 | -4.167 | 4.651 | 0.340 | torch.Size([675, 6]) || stage2.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.701 | 0.871 | 0.134 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_self.weight + | -0.000 | -0.427 | 0.471 | 0.099 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.520 | 0.546 | 0.113 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.attn.proj.weight + | -0.008 | -0.360 | 0.350 | 0.137 | torch.Size([120]) || stage2.residual_group1.blocks.5.attn.proj.bias + | 0.001 | -0.510 | 0.502 | 0.100 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.001 | -0.092 | 0.125 | 0.021 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.597 | 0.345 | 0.691 | 0.044 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.weight + | -0.015 | -0.367 | 0.987 | 0.132 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.bias + | 0.001 | -0.552 | 0.532 | 0.128 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc11.weight + | -0.009 | -0.336 | 0.253 | 0.107 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc11.bias + | 0.000 | -0.644 | 0.758 | 0.154 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc12.weight + | -0.001 | -0.243 | 0.264 | 0.088 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc12.bias + | -0.001 | -0.667 | 0.621 | 0.152 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.mlp.fc2.weight + | -0.002 | -0.447 | 1.139 | 0.183 | torch.Size([120]) || stage2.residual_group1.blocks.5.mlp.fc2.bias + | 0.002 | -0.268 | 0.331 | 0.066 | torch.Size([120, 120]) || stage2.linear1.weight + | 0.005 | -0.338 | 0.589 | 0.128 | torch.Size([120]) || stage2.linear1.bias + | 0.939 | 0.517 | 1.207 | 0.113 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.weight + | 0.023 | -0.770 | 0.614 | 0.238 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.bias + | 0.004 | -3.112 | 1.341 | 0.140 | torch.Size([3375, 6]) || stage2.residual_group2.blocks.0.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage2.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.605 | 0.580 | 0.136 | torch.Size([360, 120]) || stage2.residual_group2.blocks.0.attn.qkv_self.weight + | 0.001 | -0.591 | 0.477 | 0.112 | torch.Size([360]) || stage2.residual_group2.blocks.0.attn.qkv_self.bias + | 0.001 | -0.645 | 0.613 | 0.150 | torch.Size([120, 120]) || stage2.residual_group2.blocks.0.attn.proj.weight + | -0.031 | -0.422 | 0.330 | 0.138 | torch.Size([120]) || stage2.residual_group2.blocks.0.attn.proj.bias + | 0.684 | 0.501 | 0.807 | 0.061 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.weight + | 0.018 | -0.693 | 0.412 
| 0.181 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.bias
+ | 0.001 | -0.559 | 0.715 | 0.125 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc11.weight
+ | 0.031 | -0.346 | 0.273 | 0.108 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.744 | 0.559 | 0.146 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.005 | -0.239 | 0.270 | 0.080 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.603 | 0.871 | 0.144 | torch.Size([120, 240]) || stage2.residual_group2.blocks.0.mlp.fc2.weight
+ | -0.003 | -0.317 | 0.303 | 0.122 | torch.Size([120]) || stage2.residual_group2.blocks.0.mlp.fc2.bias
+ | 0.974 | 0.575 | 1.211 | 0.095 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.weight
+ | 0.023 | -0.703 | 0.556 | 0.208 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.bias
+ | 0.012 | -2.867 | 1.552 | 0.185 | torch.Size([3375, 6]) || stage2.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage2.residual_group2.blocks.1.attn.relative_position_index
+ | 0.000 | -0.743 | 0.663 | 0.142 | torch.Size([360, 120]) || stage2.residual_group2.blocks.1.attn.qkv_self.weight
+ | 0.002 | -0.647 | 0.654 | 0.141 | torch.Size([360]) || stage2.residual_group2.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.610 | 0.648 | 0.151 | torch.Size([120, 120]) || stage2.residual_group2.blocks.1.attn.proj.weight
+ | -0.028 | -0.565 | 0.416 | 0.167 | torch.Size([120]) || stage2.residual_group2.blocks.1.attn.proj.bias
+ | 0.742 | 0.522 | 0.891 | 0.076 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.weight
+ | 0.020 | -0.506 | 0.335 | 0.138 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.bias
+ | 0.001 | -0.486 | 0.512 | 0.123 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc11.weight
+ | 0.094 | -0.405 | 0.617 | 0.174 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.618 | 0.596 | 0.149 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc12.weight
+ | -0.001 | -0.276 | 0.202 | 0.077 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.668 | 0.769 | 0.148 | torch.Size([120, 240]) || stage2.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.014 | -0.729 | 0.410 | 0.187 | torch.Size([120]) || stage2.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.001 | -0.309 | 0.381 | 0.079 | torch.Size([120, 120]) || stage2.linear2.weight
+ | 0.017 | -0.403 | 0.399 | 0.133 | torch.Size([120]) || stage2.linear2.bias
+ | -0.000 | -0.111 | 0.126 | 0.024 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.weight
+ | 0.001 | -0.031 | 0.055 | 0.017 | torch.Size([120]) || stage2.pa_deform.bias
+ | -0.000 | -0.017 | 0.017 | 0.010 | torch.Size([120, 364, 3, 3]) || stage2.pa_deform.conv_offset.0.weight
+ | -0.010 | -0.038 | 0.021 | 0.012 | torch.Size([120]) || stage2.pa_deform.conv_offset.0.bias
+ | -0.001 | -0.113 | 0.096 | 0.020 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.2.weight
+ | -0.010 | -0.089 | 0.087 | 0.032 | torch.Size([120]) || stage2.pa_deform.conv_offset.2.bias
+ | -0.001 | -0.079 | 0.087 | 0.019 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.4.weight
+ | -0.015 | -0.134 | 0.121 | 0.058 | torch.Size([120]) || stage2.pa_deform.conv_offset.4.bias
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432, 120, 3, 3]) || stage2.pa_deform.conv_offset.6.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432]) || stage2.pa_deform.conv_offset.6.bias
+ | 0.004 | -1.011 | 1.138 | 0.150 | torch.Size([360, 360]) || stage2.pa_fuse.fc11.weight
+ | 0.151 | -0.228 | 0.674 | 0.167 | torch.Size([360]) || stage2.pa_fuse.fc11.bias
+ | 0.001 | -0.988 | 1.066 | 0.144 | torch.Size([360, 360]) || stage2.pa_fuse.fc12.weight
+ | 0.009 | -0.418 | 0.533 | 0.127 | torch.Size([360]) || stage2.pa_fuse.fc12.bias
+ | 0.000 | -0.784 | 0.831 | 0.151 | torch.Size([120, 360]) || stage2.pa_fuse.fc2.weight
+ | 0.007 | -0.581 | 0.470 | 0.257 | torch.Size([120]) || stage2.pa_fuse.fc2.bias
+ | 1.105 | 0.504 | 1.774 | 0.248 | torch.Size([480]) || stage3.reshape.1.weight
+ | -0.006 | -0.633 | 0.736 | 0.296 | torch.Size([480]) || stage3.reshape.1.bias
+ | -0.000 | -0.682 | 0.687 | 0.168 | torch.Size([120, 480]) || stage3.reshape.2.weight
+ | -0.004 | -0.207 | 0.227 | 0.086 | torch.Size([120]) || stage3.reshape.2.bias
+ | 0.735 | 0.431 | 0.997 | 0.127 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.weight
+ | -0.162 | -0.753 | 0.303 | 0.198 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.bias
+ | -0.001 | -0.490 | 0.344 | 0.037 | torch.Size([675, 6]) || stage3.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.0.attn.position_bias
+ | 0.000 | -0.333 | 0.350 | 0.061 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.004 | -0.195 | 0.128 | 0.039 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.359 | 0.365 | 0.067 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.attn.proj.weight
+ | -0.002 | -0.216 | 0.262 | 0.084 | torch.Size([120]) || stage3.residual_group1.blocks.0.attn.proj.bias
+ | 0.000 | -0.597 | 0.657 | 0.058 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_mut.weight
+ | 0.001 | -0.115 | 0.118 | 0.020 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.594 | 0.414 | 0.775 | 0.069 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.weight
+ | 0.003 | -0.260 | 0.315 | 0.105 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.bias
+ | 0.001 | -0.446 | 0.536 | 0.116 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc11.weight
+ | -0.077 | -0.361 | 0.145 | 0.072 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.507 | 0.503 | 0.124 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc12.weight
+ | 0.005 | -0.225 | 0.207 | 0.062 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.553 | 0.493 | 0.129 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.mlp.fc2.weight
+ | -0.006 | -0.268 | 0.158 | 0.085 | torch.Size([120]) || stage3.residual_group1.blocks.0.mlp.fc2.bias
+ | 0.716 | 0.376 | 0.965 | 0.119 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.weight
+ | -0.185 | -0.732 | 0.209 | 0.179 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.bias
+ | -0.002 | -0.462 | 1.414 | 0.064 | torch.Size([675, 6]) || stage3.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.1.attn.position_bias
+ | 0.000 | -0.383 | 0.438 | 0.060 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_self.weight
+ | -0.002 | -0.229 | 0.157 | 0.044 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.357 | 0.478 | 0.065 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.attn.proj.weight
+ | -0.004 | -0.280 | 0.216 | 0.101 | torch.Size([120]) || stage3.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.471 | 0.517 | 0.063 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_mut.weight
+ | -0.000 | -0.112 | 0.131 | 0.022 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 0.633 | 0.486 | 0.778 | 0.057 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.weight
+ | 0.004 | -0.350 | 0.280 | 0.107 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.bias
+ | 0.001 | -0.513 | 0.512 | 0.118 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.081 | -0.274 | 0.096 | 0.071 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.548 | 0.533 | 0.126 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc12.weight
+ | -0.003 | -0.181 | 0.194 | 0.059 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.499 | 0.534 | 0.128 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.mlp.fc2.weight
+ | -0.007 | -0.282 | 0.152 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.1.mlp.fc2.bias
+ | 0.796 | 0.469 | 1.007 | 0.111 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.weight
+ | -0.109 | -0.638 | 0.181 | 0.146 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.bias
+ | -0.004 | -1.009 | 1.155 | 0.105 | torch.Size([675, 6]) || stage3.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.2.attn.position_bias
+ | -0.000 | -0.378 | 0.375 | 0.081 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_self.weight
+ | 0.003 | -0.263 | 0.331 | 0.066 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_self.bias
+ | -0.000 | -0.485 | 0.366 | 0.074 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.attn.proj.weight
+ | -0.001 | -0.249 | 0.145 | 0.080 | torch.Size([120]) || stage3.residual_group1.blocks.2.attn.proj.bias
+ | -0.001 | -0.332 | 0.421 | 0.063 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_mut.weight
+ | -0.001 | -0.098 | 0.083 | 0.016 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 0.657 | 0.507 | 0.776 | 0.053 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.weight
+ | 0.003 | -0.270 | 0.280 | 0.104 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.bias
+ | 0.000 | -0.445 | 0.556 | 0.117 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.097 | -0.295 | 0.100 | 0.070 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc11.bias
+ | -0.000 | -0.480 | 0.501 | 0.126 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc12.weight
+ | 0.005 | -0.148 | 0.191 | 0.060 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc12.bias
+ | 0.001 | -0.569 | 0.484 | 0.126 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.mlp.fc2.weight
+ | -0.006 | -0.246 | 0.161 | 0.082 | torch.Size([120]) || stage3.residual_group1.blocks.2.mlp.fc2.bias
+ | 0.814 | 0.482 | 1.048 | 0.109 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.weight
+ | -0.138 | -0.585 | 0.128 | 0.129 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.bias
+ | -0.008 | -1.801 | 4.148 | 0.110 | torch.Size([675, 6]) || stage3.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.3.attn.position_bias
+ | -0.001 | -0.364 | 0.546 | 0.076 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_self.weight
+ | 0.003 | -0.179 | 0.182 | 0.046 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_self.bias
+ | 0.000 | -0.378 | 0.385 | 0.070 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.attn.proj.weight
+ | -0.005 | -0.368 | 0.175 | 0.101 | torch.Size([120]) || stage3.residual_group1.blocks.3.attn.proj.bias
+ | 0.000 | -0.338 | 0.461 | 0.062 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_mut.weight
+ | 0.000 | -0.098 | 0.082 | 0.019 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 0.676 | 0.526 | 0.799 | 0.056 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.weight
+ | 0.002 | -0.269 | 0.242 | 0.090 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.bias
+ | 0.000 | -0.474 | 0.505 | 0.118 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.095 | -0.247 | 0.071 | 0.063 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc11.bias
+ | 0.000 | -0.518 | 0.502 | 0.126 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.003 | -0.194 | 0.228 | 0.068 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc12.bias
+ | -0.001 | -0.502 | 0.499 | 0.124 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.mlp.fc2.weight
+ | -0.007 | -0.248 | 0.207 | 0.098 | torch.Size([120]) || stage3.residual_group1.blocks.3.mlp.fc2.bias
+ | 0.843 | 0.498 | 1.046 | 0.099 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.weight
+ | -0.082 | -0.456 | 0.195 | 0.111 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.bias
+ | -0.012 | -3.133 | 2.263 | 0.177 | torch.Size([675, 6]) || stage3.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.4.attn.position_bias
+ | 0.001 | -0.494 | 0.443 | 0.096 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.004 | -0.492 | 0.329 | 0.088 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_self.bias
+ | -0.000 | -0.464 | 0.391 | 0.080 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.attn.proj.weight
+ | -0.003 | -0.420 | 0.332 | 0.124 | torch.Size([120]) || stage3.residual_group1.blocks.4.attn.proj.bias
+ | 0.001 | -0.469 | 0.518 | 0.068 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_mut.weight
+ | 0.001 | -0.068 | 0.099 | 0.014 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.705 | 0.598 | 0.823 | 0.047 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.weight
+ | 0.001 | -0.161 | 0.155 | 0.065 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.bias
+ | 0.000 | -0.526 | 0.442 | 0.119 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc11.weight
+ | -0.102 | -0.319 | 0.054 | 0.072 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc11.bias
+ | 0.000 | -0.555 | 0.499 | 0.126 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc12.weight
+ | -0.003 | -0.201 | 0.135 | 0.065 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc12.bias
+ | 0.001 | -0.454 | 0.522 | 0.122 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.011 | -0.379 | 0.195 | 0.091 | torch.Size([120]) || stage3.residual_group1.blocks.4.mlp.fc2.bias
+ | 0.856 | 0.618 | 1.073 | 0.095 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.weight
+ | -0.059 | -0.368 | 0.153 | 0.095 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.bias
+ | -0.006 | -1.747 | 1.724 | 0.133 | torch.Size([675, 6]) || stage3.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -0.399 | 0.417 | 0.090 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_self.weight
+ | 0.009 | -0.294 | 0.398 | 0.079 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_self.bias
+ | 0.001 | -0.345 | 0.341 | 0.067 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.attn.proj.weight
+ | -0.004 | -0.435 | 0.326 | 0.113 | torch.Size([120]) || stage3.residual_group1.blocks.5.attn.proj.bias
+ | -0.000 | -0.370 | 0.339 | 0.052 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_mut.weight
+ | -0.000 | -0.059 | 0.060 | 0.012 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.707 | 0.600 | 0.832 | 0.051 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.weight
+ | -0.001 | -0.157 | 0.140 | 0.063 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.bias
+ | 0.001 | -0.473 | 0.464 | 0.117 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc11.weight
+ | -0.091 | -0.291 | 0.092 | 0.073 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc11.bias
+ | -0.000 | -0.479 | 0.477 | 0.124 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.004 | -0.197 | 0.180 | 0.063 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc12.bias
+ | -0.001 | -0.504 | 0.440 | 0.118 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.mlp.fc2.weight
+ | -0.008 | -0.449 | 0.421 | 0.135 | torch.Size([120]) || stage3.residual_group1.blocks.5.mlp.fc2.bias
+ | 0.003 | -0.331 | 0.524 | 0.083 | torch.Size([120, 120]) || stage3.linear1.weight
+ | -0.001 | -0.270 | 0.250 | 0.116 | torch.Size([120]) || stage3.linear1.bias
+ | 0.883 | 0.354 | 1.107 | 0.120 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.weight
+ | 0.011 | -0.416 | 0.299 | 0.131 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.bias
+ | 0.000 | -0.322 | 0.139 | 0.028 | torch.Size([3375, 6]) || stage3.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage3.residual_group2.blocks.0.attn.relative_position_index
+ | 0.000 | -0.470 | 0.455 | 0.097 | torch.Size([360, 120]) || stage3.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.007 | -0.384 | 0.374 | 0.125 | torch.Size([360]) || stage3.residual_group2.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.467 | 0.428 | 0.109 | torch.Size([120, 120]) || stage3.residual_group2.blocks.0.attn.proj.weight
+ | -0.009 | -0.348 | 0.279 | 0.126 | torch.Size([120]) || stage3.residual_group2.blocks.0.attn.proj.bias
+ | 0.873 | 0.618 | 1.060 | 0.070 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.weight
+ | 0.005 | -0.242 | 0.278 | 0.098 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.bias
+ | 0.000 | -0.549 | 0.437 | 0.115 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.053 | -0.174 | 0.127 | 0.058 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.469 | 0.517 | 0.124 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.002 | -0.133 | 0.187 | 0.052 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.548 | 0.557 | 0.125 | torch.Size([120, 240]) || stage3.residual_group2.blocks.0.mlp.fc2.weight
+ | -0.011 | -0.339 | 0.303 | 0.116 | torch.Size([120]) || stage3.residual_group2.blocks.0.mlp.fc2.bias
+ | 0.960 | 0.744 | 1.153 | 0.095 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.weight
+ | 0.004 | -0.302 | 0.238 | 0.099 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.bias
+ | 0.000 | -0.567 | 0.133 | 0.032 | torch.Size([3375, 6]) || stage3.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage3.residual_group2.blocks.1.attn.relative_position_index
+ | 0.000 | -0.425 | 0.414 | 0.087 | torch.Size([360, 120]) || stage3.residual_group2.blocks.1.attn.qkv_self.weight
+ | 0.001 | -0.419 | 0.485 | 0.116 | torch.Size([360]) || stage3.residual_group2.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.429 | 0.385 | 0.095 | torch.Size([120, 120]) || stage3.residual_group2.blocks.1.attn.proj.weight
+ | -0.011 | -0.398 | 0.287 | 0.123 | torch.Size([120]) || stage3.residual_group2.blocks.1.attn.proj.bias
+ | 0.909 | 0.770 | 1.090 | 0.066 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.weight
+ | -0.000 | -0.204 | 0.175 | 0.073 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.bias
+ | 0.000 | -0.451 | 0.462 | 0.115 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.069 | -0.268 | 0.143 | 0.077 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.488 | 0.602 | 0.126 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc12.weight
+ | -0.004 | -0.179 | 0.114 | 0.050 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.480 | 0.466 | 0.118 | torch.Size([120, 240]) || stage3.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.007 | -0.358 | 0.225 | 0.102 | torch.Size([120]) || stage3.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.003 | -0.274 | 0.457 | 0.073 | torch.Size([120, 120]) || stage3.linear2.weight
+ | 0.002 | -0.532 | 0.438 | 0.200 | torch.Size([120]) || stage3.linear2.bias
+ | -0.000 | -0.098 | 0.115 | 0.025 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.weight
+ | 0.002 | -0.033 | 0.041 | 0.015 | torch.Size([120]) || stage3.pa_deform.bias
+ | 0.000 | -0.017 | 0.017 | 0.010 | torch.Size([120, 364, 3, 3]) || stage3.pa_deform.conv_offset.0.weight
+ | -0.010 | -0.030 | 0.017 | 0.010 | torch.Size([120]) || stage3.pa_deform.conv_offset.0.bias
+ | -0.000 | -0.078 | 0.069 | 0.020 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.2.weight
+ | -0.006 | -0.055 | 0.067 | 0.026 | torch.Size([120]) || stage3.pa_deform.conv_offset.2.bias
+ | -0.001 | -0.071 | 0.067 | 0.020 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.4.weight
+ | 0.004 | -0.070 | 0.113 | 0.042 | torch.Size([120]) || stage3.pa_deform.conv_offset.4.bias
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432, 120, 3, 3]) || stage3.pa_deform.conv_offset.6.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432]) || stage3.pa_deform.conv_offset.6.bias
+ | 0.004 | -0.623 | 0.669 | 0.126 | torch.Size([360, 360]) || stage3.pa_fuse.fc11.weight
+ | 0.092 | -0.221 | 0.676 | 0.151 | torch.Size([360]) || stage3.pa_fuse.fc11.bias
+ | 0.000 | -0.604 | 0.689 | 0.125 | torch.Size([360, 360]) || stage3.pa_fuse.fc12.weight
+ | 0.008 | -0.544 | 0.379 | 0.118 | torch.Size([360]) || stage3.pa_fuse.fc12.bias
+ | 0.000 | -0.669 | 0.719 | 0.151 | torch.Size([120, 360]) || stage3.pa_fuse.fc2.weight
+ | -0.005 | -0.411 | 0.443 | 0.155 | torch.Size([120]) || stage3.pa_fuse.fc2.bias
+ | 1.005 | 0.488 | 1.503 | 0.166 | torch.Size([480]) || stage4.reshape.1.weight
+ | 0.001 | -0.316 | 0.358 | 0.118 | torch.Size([480]) || stage4.reshape.1.bias
+ | 0.000 | -0.486 | 0.450 | 0.084 | torch.Size([120, 480]) || stage4.reshape.2.weight
+ | -0.007 | -0.139 | 0.092 | 0.043 | torch.Size([120]) || stage4.reshape.2.bias
+ | 0.996 | 0.831 | 1.101 | 0.039 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.weight
+ | -0.014 | -0.109 | 0.112 | 0.040 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.bias
+ | 0.000 | -0.064 | 0.064 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.0.attn.position_bias
+ | -0.000 | -0.109 | 0.107 | 0.023 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.001 | -0.033 | 0.029 | 0.009 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.256 | 0.235 | 0.030 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.attn.proj.weight
+ | 0.007 | -0.099 | 0.227 | 0.051 | torch.Size([120]) || stage4.residual_group1.blocks.0.attn.proj.bias
+ | -0.000 | -0.129 | 0.142 | 0.025 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_mut.weight
+ | -0.000 | -0.035 | 0.029 | 0.006 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.966 | 0.869 | 1.089 | 0.041 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.weight
+ | 0.000 | -0.155 | 0.152 | 0.058 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.bias
+ | -0.000 | -0.248 | 0.221 | 0.024 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc11.weight
+ | -0.002 | -0.066 | 0.012 | 0.007 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.287 | 0.219 | 0.024 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc12.weight
+ | 0.000 | -0.085 | 0.067 | 0.010 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.256 | 0.235 | 0.025 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.009 | -0.123 | 0.254 | 0.058 | torch.Size([120]) || stage4.residual_group1.blocks.0.mlp.fc2.bias
+ | 0.988 | 0.825 | 1.079 | 0.043 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.weight
+ | -0.013 | -0.123 | 0.105 | 0.047 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.bias
+ | -0.000 | -0.081 | 0.078 | 0.021 | torch.Size([675, 6]) || stage4.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.1.attn.position_bias
+ | 0.000 | -0.133 | 0.170 | 0.025 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_self.weight
+ | -0.000 | -0.053 | 0.048 | 0.014 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.177 | 0.174 | 0.031 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.attn.proj.weight
+ | 0.008 | -0.099 | 0.204 | 0.048 | torch.Size([120]) || stage4.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.138 | 0.130 | 0.026 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_mut.weight
+ | 0.000 | -0.061 | 0.059 | 0.010 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 0.996 | 0.943 | 1.081 | 0.026 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.weight
+ | 0.001 | -0.064 | 0.051 | 0.027 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.bias
+ | -0.000 | -0.336 | 0.268 | 0.024 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc11.weight
+ | 0.000 | -0.029 | 0.028 | 0.006 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.223 | 0.272 | 0.024 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc12.weight
+ | -0.001 | -0.084 | 0.037 | 0.009 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.207 | 0.216 | 0.024 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.007 | -0.140 | 0.216 | 0.058 | torch.Size([120]) || stage4.residual_group1.blocks.1.mlp.fc2.bias
+ | 0.994 | 0.855 | 1.108 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.weight
+ | -0.019 | -0.115 | 0.091 | 0.028 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.bias
+ | 0.000 | -0.063 | 0.076 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.2.attn.position_bias
+ | -0.000 | -0.190 | 0.179 | 0.027 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_self.weight
+ | -0.001 | -0.043 | 0.039 | 0.011 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_self.bias
+ | 0.000 | -0.158 | 0.161 | 0.030 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.attn.proj.weight
+ | 0.008 | -0.118 | 0.164 | 0.050 | torch.Size([120]) || stage4.residual_group1.blocks.2.attn.proj.bias
+ | -0.000 | -0.213 | 0.211 | 0.029 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_mut.weight
+ | -0.000 | -0.043 | 0.040 | 0.010 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 0.993 | 0.903 | 1.099 | 0.028 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.weight
+ | 0.003 | -0.097 | 0.106 | 0.044 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.bias
+ | 0.000 | -0.186 | 0.177 | 0.024 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.000 | -0.068 | 0.045 | 0.010 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc11.bias
+ | 0.000 | -0.307 | 0.185 | 0.024 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc12.weight
+ | -0.000 | -0.081 | 0.061 | 0.010 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc12.bias
+ | 0.000 | -0.195 | 0.216 | 0.024 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.mlp.fc2.weight
+ | 0.008 | -0.115 | 0.161 | 0.050 | torch.Size([120]) || stage4.residual_group1.blocks.2.mlp.fc2.bias
+ | 0.997 | 0.893 | 1.071 | 0.032 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.weight
+ | -0.019 | -0.083 | 0.047 | 0.024 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.bias
+ | 0.001 | -0.076 | 0.073 | 0.021 | torch.Size([675, 6]) || stage4.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.275 | 0.259 | 0.029 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_self.weight
+ | -0.001 | -0.071 | 0.066 | 0.017 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_self.bias
+ | -0.000 | -0.166 | 0.157 | 0.028 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.attn.proj.weight
+ | 0.008 | -0.105 | 0.149 | 0.043 | torch.Size([120]) || stage4.residual_group1.blocks.3.attn.proj.bias
+ | 0.000 | -0.184 | 0.197 | 0.028 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_mut.weight
+ | 0.001 | -0.042 | 0.050 | 0.008 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 1.001 | 0.971 | 1.136 | 0.022 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.weight
+ | -0.002 | -0.054 | 0.050 | 0.023 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.bias
+ | 0.000 | -0.329 | 0.210 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.000 | -0.078 | 0.029 | 0.009 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc11.bias
+ | -0.000 | -0.234 | 0.241 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc12.weight
+ | 0.000 | -0.031 | 0.024 | 0.006 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.169 | 0.164 | 0.023 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.007 | -0.085 | 0.114 | 0.043 | torch.Size([120]) || stage4.residual_group1.blocks.3.mlp.fc2.bias
+ | 1.003 | 0.901 | 1.099 | 0.044 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.weight
+ | -0.034 | -0.095 | 0.039 | 0.030 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.bias
+ | 0.000 | -0.071 | 0.090 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.238 | 0.268 | 0.034 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.002 | -0.199 | 0.144 | 0.030 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_self.bias
+ | -0.000 | -0.167 | 0.218 | 0.029 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.attn.proj.weight
+ | 0.008 | -0.089 | 0.140 | 0.039 | torch.Size([120]) || stage4.residual_group1.blocks.4.attn.proj.bias
+ | 0.000 | -0.267 | 0.253 | 0.031 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_mut.weight
+ | 0.001 | -0.067 | 0.069 | 0.009 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 1.004 | 0.953 | 1.056 | 0.014 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.weight
+ | -0.001 | -0.056 | 0.077 | 0.021 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.bias
+ | -0.000 | -0.170 | 0.184 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc11.weight
+ | 0.001 | -0.037 | 0.027 | 0.007 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc11.bias
+ | 0.000 | -0.149 | 0.202 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.000 | -0.059 | 0.095 | 0.010 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc12.bias
+ | -0.000 | -0.145 | 0.181 | 0.023 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.mlp.fc2.weight
+ | 0.006 | -0.086 | 0.117 | 0.036 | torch.Size([120]) || stage4.residual_group1.blocks.4.mlp.fc2.bias
+ | 0.996 | 0.859 | 1.077 | 0.047 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.weight
+ | -0.058 | -0.153 | 0.009 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.bias
+ | 0.000 | -0.087 | 0.083 | 0.021 | torch.Size([675, 6]) || stage4.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -0.249 | 0.266 | 0.033 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_self.weight
+ | -0.001 | -0.199 | 0.168 | 0.031 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_self.bias
+ | 0.000 | -0.156 | 0.142 | 0.027 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.attn.proj.weight
+ | 0.004 | -0.102 | 0.145 | 0.045 | torch.Size([120]) || stage4.residual_group1.blocks.5.attn.proj.bias
+ | 0.000 | -0.299 | 0.376 | 0.033 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_mut.weight
+ | 0.000 | -0.034 | 0.066 | 0.007 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.992 | 0.924 | 1.097 | 0.025 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.weight
+ | -0.002 | -0.089 | 0.074 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.bias
+ | -0.000 | -0.192 | 0.208 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc11.weight
+ | -0.002 | -0.064 | 0.021 | 0.009 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc11.bias
+ | -0.000 | -0.240 | 0.191 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.000 | -0.040 | 0.044 | 0.008 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc12.bias
+ | -0.000 | -0.141 | 0.155 | 0.022 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.mlp.fc2.weight
+ | 0.005 | -0.107 | 0.103 | 0.045 | torch.Size([120]) || stage4.residual_group1.blocks.5.mlp.fc2.bias
+ | 0.001 | -0.286 | 0.303 | 0.059 | torch.Size([120, 120]) || stage4.linear1.weight
+ | -0.012 | -0.311 | 0.190 | 0.090 | torch.Size([120]) || stage4.linear1.bias
+ | 1.009 | 0.926 | 1.101 | 0.028 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.weight
+ | -0.001 | -0.036 | 0.048 | 0.015 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.bias
+ | 0.000 | -0.071 | 0.076 | 0.020 | torch.Size([3375, 6]) || stage4.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage4.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.135 | 0.141 | 0.023 | torch.Size([360, 120]) || stage4.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.001 | -0.023 | 0.021 | 0.007 | torch.Size([360]) || stage4.residual_group2.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.115 | 0.121 | 0.025 | torch.Size([120, 120]) || stage4.residual_group2.blocks.0.attn.proj.weight
+ | -0.007 | -0.200 | 0.098 | 0.043 | torch.Size([120]) || stage4.residual_group2.blocks.0.attn.proj.bias
+ | 1.002 | 0.999 | 1.016 | 0.002 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.weight
+ | 0.000 | -0.003 | 0.004 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.bias
+ | 0.000 | -0.082 | 0.094 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc11.weight
+ | 0.000 | -0.005 | 0.017 | 0.002 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.088 | 0.079 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.000 | -0.010 | 0.008 | 0.002 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.090 | 0.105 | 0.020 | torch.Size([120, 240]) || stage4.residual_group2.blocks.0.mlp.fc2.weight
+ | -0.006 | -0.181 | 0.096 | 0.041 | torch.Size([120]) || stage4.residual_group2.blocks.0.mlp.fc2.bias
+ | 1.006 | 0.923 | 1.098 | 0.025 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.weight
+ | -0.001 | -0.045 | 0.053 | 0.019 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.bias
+ | -0.000 | -0.083 | 0.085 | 0.020 | torch.Size([3375, 6]) || stage4.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage4.residual_group2.blocks.1.attn.relative_position_index
+ | -0.000 | -0.132 | 0.133 | 0.023 | torch.Size([360, 120]) || stage4.residual_group2.blocks.1.attn.qkv_self.weight
+ | -0.000 | -0.030 | 0.035 | 0.009 | torch.Size([360]) || stage4.residual_group2.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.129 | 0.094 | 0.024 | torch.Size([120, 120]) || stage4.residual_group2.blocks.1.attn.proj.weight
+ | -0.008 | -0.218 | 0.116 | 0.048 | torch.Size([120]) || stage4.residual_group2.blocks.1.attn.proj.bias
+ | 1.003 | 0.999 | 1.024 | 0.003 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.weight
+ | -0.000 | -0.004 | 0.005 | 0.002 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.bias
+ | -0.000 | -0.126 | 0.080 | 0.021 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc11.weight
+ | 0.001 | -0.006 | 0.016 | 0.003 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.092 | 0.076 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc12.weight
+ | 0.000 | -0.015 | 0.013 | 0.003 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.091 | 0.115 | 0.020 | torch.Size([120, 240]) || stage4.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.006 | -0.196 | 0.090 | 0.041 | torch.Size([120]) || stage4.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.001 | -0.291 | 0.416 | 0.059 | torch.Size([120, 120]) || stage4.linear2.weight
+ | -0.009 | -0.269 | 0.198 | 0.094 | torch.Size([120]) || stage4.linear2.bias
+ | 0.000 | -0.053 | 0.057 | 0.019 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.weight
+ | -0.001 | -0.021 | 0.021 | 0.009 | torch.Size([120]) || stage4.pa_deform.bias
+ | -0.000 | -0.017 | 0.017 | 0.010 | torch.Size([120, 364, 3, 3]) || stage4.pa_deform.conv_offset.0.weight
+ | -0.000 | -0.015 | 0.015 | 0.009 | torch.Size([120]) || stage4.pa_deform.conv_offset.0.bias
+ | -0.000 | -0.039 | 0.041 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.2.weight
+ | 0.000 | -0.030 | 0.029 | 0.018 | torch.Size([120]) || stage4.pa_deform.conv_offset.2.bias
+ | -0.000 | -0.045 | 0.041 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.4.weight
+ | -0.002 | -0.031 | 0.030 | 0.016 | torch.Size([120]) || stage4.pa_deform.conv_offset.4.bias
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432, 120, 3, 3]) || stage4.pa_deform.conv_offset.6.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432]) || stage4.pa_deform.conv_offset.6.bias
+ | -0.000 | -0.356 | 0.435 | 0.035 | torch.Size([360, 360]) || stage4.pa_fuse.fc11.weight
+ | 0.003 | -0.080 | 0.304 | 0.033 | torch.Size([360]) || stage4.pa_fuse.fc11.bias
+ | 0.000 | -0.361 | 0.436 | 0.035 | torch.Size([360, 360]) || stage4.pa_fuse.fc12.weight
+ | -0.001 | -0.166 | 0.299 | 0.032 | torch.Size([360]) || stage4.pa_fuse.fc12.bias
+ | -0.000 | -0.748 | 0.752 | 0.056 | torch.Size([120, 360]) || stage4.pa_fuse.fc2.weight
+ | -0.000 | -0.262 | 0.270 | 0.086 | torch.Size([120]) || stage4.pa_fuse.fc2.bias
+ | 0.980 | 0.710 | 1.274 | 0.146 | torch.Size([30]) || stage5.reshape.1.weight
+ | -0.002 | -0.062 | 0.057 | 0.036 | torch.Size([30]) || stage5.reshape.1.bias
+ | 0.001 | -0.530 | 0.432 | 0.092 | torch.Size([120, 30]) || stage5.reshape.2.weight
+ | 0.021 | -0.305 | 0.337 | 0.080 | torch.Size([120]) || stage5.reshape.2.bias
+ | 0.994 | 0.934 | 1.012 | 0.016 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.weight
+ | -0.014 | -0.040 | 0.038 | 0.014 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.bias
+ | 0.000 | -0.082 | 0.072 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.0.attn.position_bias
+ | -0.000 | -0.078 | 0.101 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.000 | -0.022 | 0.023 | 0.005 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.198 | 0.237 | 0.022 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.attn.proj.weight
+ | -0.003 | -0.067 | 0.082 | 0.027 | torch.Size([120]) || stage5.residual_group1.blocks.0.attn.proj.bias
+ | 0.000 | -0.103 | 0.092 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_mut.weight
+ | -0.000 | -0.007 | 0.006 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.991 | 0.929 | 1.004 | 0.011 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.weight
+ | 0.001 | -0.009 | 0.014 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.bias
+ | -0.000 | -0.112 | 0.093 | 0.021 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc11.weight
+ | -0.001 | -0.033 | 0.027 | 0.008 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.098 | 0.085 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc12.weight
+ | -0.000 | -0.033 | 0.026 | 0.009 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.163 | 0.140 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.003 | -0.060 | 0.110 | 0.032 | torch.Size([120]) || stage5.residual_group1.blocks.0.mlp.fc2.bias
+ | 0.992 | 0.872 | 1.010 | 0.018 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.weight
+ | -0.015 | -0.039 | 0.031 | 0.010 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.bias
+ | -0.000 | -0.078 | 0.078 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.1.attn.position_bias
+ | 0.000 | -0.088 | 0.099 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.030 | 0.030 | 0.006 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.151 | 0.185 | 0.022 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.attn.proj.weight
+ | -0.005 | -0.073 | 0.061 | 0.024 | torch.Size([120]) || stage5.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.093 | 0.089 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_mut.weight
+ | 0.000 | -0.009 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 0.997 | 0.923 | 1.003 | 0.008 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.weight
+ | 0.000 | -0.008 | 0.009 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.bias
+ | -0.000 | -0.082 | 0.092 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.000 | -0.023 | 0.021 | 0.007 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.082 | 0.078 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc12.weight
+ | -0.001 | -0.028 | 0.025 | 0.008 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.097 | 0.090 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.000 | -0.062 | 0.102 | 0.028 | torch.Size([120]) || stage5.residual_group1.blocks.1.mlp.fc2.bias
+ | 0.994 | 0.845 | 1.015 | 0.018 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.weight
+ | -0.018 | -0.045 | 0.016 | 0.008 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.bias
+ | 0.000 | -0.065 | 0.068 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.2.attn.position_bias
+ | -0.000 | -0.088 | 0.113 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_self.weight
+ | 0.000 | -0.022 | 0.020 | 0.005 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_self.bias
+ | -0.000 | -0.124 | 0.124 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.attn.proj.weight
+ | -0.001 | -0.061 | 0.049 | 0.020 | torch.Size([120]) || stage5.residual_group1.blocks.2.attn.proj.bias
+ | -0.000 | -0.088 | 0.087 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_mut.weight
+ | -0.000 | -0.008 | 0.005 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 0.993 | 0.847 | 1.012 | 0.016 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.weight
+ | 0.000 | -0.014 | 0.015 | 0.007 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.bias
+ | 0.000 | -0.096 | 0.096 | 0.021 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc11.weight
+ | 0.001 | -0.038 | 0.027 | 0.009 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc11.bias
+ | -0.000 | -0.090 | 0.095 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc12.weight
+ | 0.000 | -0.045 | 0.039 | 0.011 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc12.bias
+ | -0.000 | -0.153 | 0.130 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.mlp.fc2.weight
+ | -0.006 | -0.097 | 0.083 | 0.028 | torch.Size([120]) || stage5.residual_group1.blocks.2.mlp.fc2.bias
+ | 0.984 | 0.798 | 1.006 | 0.023 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.weight
+ | -0.018 | -0.042 | 0.003 | 0.010 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.bias
+ | 0.000 | -0.074 | 0.214 | 0.021 | torch.Size([675, 6]) || stage5.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.133 | 0.132 | 0.022 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_self.weight
+ | -0.000 | -0.035 | 0.037 | 0.008 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_self.bias
+ | -0.000 | -0.121 | 0.123 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.attn.proj.weight
+ | -0.002 | -0.043 | 0.049 | 0.016 | torch.Size([120]) || stage5.residual_group1.blocks.3.attn.proj.bias
+ | 0.000 | -0.082 | 0.093 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_mut.weight
+ | -0.000 | -0.007 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 0.993 | 0.809 | 1.008 | 0.018 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.weight
+ | 0.001 | -0.018 | 0.013 | 0.006 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.bias
+ | -0.000 | -0.100 | 0.097 | 0.021 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc11.weight
+ | 0.001 | -0.038 | 0.045 | 0.009 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc11.bias
+ | -0.000 | -0.104 | 0.095 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.000 | -0.043 | 0.040 | 0.011 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.108 | 0.121 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.002 | -0.066 | 0.048 | 0.023 | torch.Size([120]) || stage5.residual_group1.blocks.3.mlp.fc2.bias
+ | 0.988 | 0.835 | 1.035 | 0.019 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.weight
+ | -0.022 | -0.052 | 0.003 | 0.013 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.bias
+ | -0.000 | -0.086 | 0.118 | 0.021 | torch.Size([675, 6]) || stage5.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.4.attn.position_bias
+ | 0.000 | -0.199 | 0.223 | 0.023 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.000 | -0.045 | 0.028 | 0.009 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_self.bias
+ | 0.000 | -0.114 | 0.143 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.attn.proj.weight
+ | -0.003 | -0.060 | 0.047 | 0.021 | torch.Size([120]) || stage5.residual_group1.blocks.4.attn.proj.bias
+ | -0.000 | -0.117 | 0.102 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_mut.weight
+ | 0.000 | -0.008 | 0.010 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.994 | 0.774 | 1.007 | 0.021 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.weight
+ | 0.001 | -0.023 | 0.027 | 0.010 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.bias
+ | -0.000 | -0.085 | 0.107 | 0.021 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc11.weight
+ | 0.003 | -0.044 | 0.042 | 0.013 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc11.bias
+ | -0.000 | -0.103 | 0.080 | 0.021 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.000 | -0.067 | 0.058 | 0.015 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc12.bias
+ | 0.000 | -0.096 | 0.103 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.000 | -0.045 | 0.054 | 0.023 | torch.Size([120]) || stage5.residual_group1.blocks.4.mlp.fc2.bias
+ | 0.985 | 0.552 | 1.092 | 0.044 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.weight
+ | -0.023 | -0.073 | 0.024 | 0.019 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.bias
+ | -0.000 | -0.080 | 0.121 | 0.021 | torch.Size([675, 6]) || stage5.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -1.776 | 0.186 | 0.026 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_self.weight
+ | -0.000 | -0.070 | 0.065 | 0.015 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_self.bias
+ | 0.000 | -0.230 | 0.359 | 0.022 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.attn.proj.weight
+ | -0.001 | -0.062 | 0.079 | 0.028 | torch.Size([120]) || stage5.residual_group1.blocks.5.attn.proj.bias
+ | -0.000 | -0.086 | 0.104 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_mut.weight
+ | -0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.976 | 0.863 | 0.995 | 0.015 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.weight
+ | -0.001 | -0.037 | 0.053 | 0.018 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.bias
+ | -0.000 | -0.121 | 0.100 | 0.021 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc11.weight
+ | 0.009 | -0.074 | 0.101 | 0.021 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc11.bias
+ | 0.000 | -0.102 | 0.101 | 0.021 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.001 | -0.092 | 0.082 | 0.028 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc12.bias
+ | -0.000 | -0.148 | 0.202 | 0.022 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.mlp.fc2.weight
+ | 0.001 | -0.056 | 0.054 | 0.025 | torch.Size([120]) || stage5.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.139 | 0.123 | 0.024 | torch.Size([120, 120]) || stage5.linear1.weight
+ | 0.022 | -0.317 | 0.336 | 0.081 | torch.Size([120]) || stage5.linear1.bias
+ | 0.963 | 0.765 | 1.026 | 0.058 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.weight
+ | -0.001 | -0.315 | 0.286 | 0.078 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.bias
+ | 0.000 | -0.077 | 0.080 | 0.020 | torch.Size([3375, 6]) || stage5.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage5.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.159 | 0.119 | 0.022 | torch.Size([360, 120]) || stage5.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.000 | -0.038 | 0.044 | 0.013 | torch.Size([360]) || stage5.residual_group2.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.134 | 0.126 | 0.024 | torch.Size([120, 120]) || stage5.residual_group2.blocks.0.attn.proj.weight
+ | -0.005 | -0.263 | 0.230 | 0.060 | torch.Size([120]) || stage5.residual_group2.blocks.0.attn.proj.bias
+ | 0.990 | 0.913 | 1.001 | 0.017 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.weight
+ | 0.000 | -0.009 | 0.010 | 0.004 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.bias
+ | -0.000 | -0.077 | 0.089 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.004 | -0.025 | 0.016 | 0.007 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.073 | 0.090 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.000 | -0.018 | 0.018 | 0.007 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.084 | 0.083 | 0.020 | torch.Size([120, 240]) || stage5.residual_group2.blocks.0.mlp.fc2.weight
+ | -0.006 | -0.264 | 0.273 | 0.056 | torch.Size([120]) || stage5.residual_group2.blocks.0.mlp.fc2.bias
+ | 0.976 | 0.733 | 1.048 | 0.053 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.weight
+ | -0.001 | -0.265 | 0.241 | 0.061 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.bias
+ | -0.000 | -0.079 | 0.081 | 0.020 | torch.Size([3375, 6]) || stage5.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage5.residual_group2.blocks.1.attn.relative_position_index
+ | -0.000 | -0.145 | 0.145 | 0.023 | torch.Size([360, 120]) || stage5.residual_group2.blocks.1.attn.qkv_self.weight
+ | -0.000 | -0.031 | 0.051 | 0.009 | torch.Size([360]) || stage5.residual_group2.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.114 | 0.103 | 0.025 | torch.Size([120, 120]) || stage5.residual_group2.blocks.1.attn.proj.weight
+ | -0.011 | -0.166 | 0.119 | 0.032 | torch.Size([120]) || stage5.residual_group2.blocks.1.attn.proj.bias
+ | 0.993 | 0.939 | 1.001 | 0.012 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.weight
+ | 0.000 | -0.011 | 0.008 | 0.004 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.bias
+ | -0.000 | -0.090 | 0.081 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.002 | -0.026 | 0.020 | 0.007 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.092 | 0.078 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc12.weight
+ | 0.000 | -0.020 | 0.021 | 0.007 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.097 | 0.093 | 0.020 | torch.Size([120, 240]) || stage5.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.016 | -0.224 | 0.158 | 0.041 | torch.Size([120]) || stage5.residual_group2.blocks.1.mlp.fc2.bias
+ | -0.000 | -0.244 | 0.248 | 0.044 | torch.Size([120, 120]) || stage5.linear2.weight
+ | 0.022 | -0.367 | 0.377 | 0.103 | torch.Size([120]) || stage5.linear2.bias
+ | -0.000 | -0.153 | 0.112 | 0.022 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.weight
+ | -0.004 | -0.061 | 0.053 | 0.023 | torch.Size([120]) || stage5.pa_deform.bias
+ | -0.000 | -0.017 | 0.017 | 0.010 | torch.Size([120, 364, 3, 3]) || stage5.pa_deform.conv_offset.0.weight
+ | -0.010 | -0.038 | 0.022 | 0.013 | torch.Size([120]) || stage5.pa_deform.conv_offset.0.bias
+ | -0.001 | -0.081 | 0.076 | 0.020 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.2.weight
+ | -0.008 | -0.062 | 0.031 | 0.021 | torch.Size([120]) || stage5.pa_deform.conv_offset.2.bias
+ | -0.000 | -0.080 | 0.079 | 0.019 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.4.weight
+ | -0.005 | -0.057 | 0.035 | 0.020 | torch.Size([120]) || stage5.pa_deform.conv_offset.4.bias
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432, 120, 3, 3]) || stage5.pa_deform.conv_offset.6.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432]) || stage5.pa_deform.conv_offset.6.bias
+ | 0.000 | -0.590 | 0.536 | 0.063 | torch.Size([360, 360]) || stage5.pa_fuse.fc11.weight
+ | 0.075 | -0.075 | 0.431 | 0.094 | torch.Size([360]) || stage5.pa_fuse.fc11.bias
+ | 0.000 | -0.704 | 0.718 | 0.064 | torch.Size([360, 360]) || stage5.pa_fuse.fc12.weight
+ | 0.005 | -0.308 | 0.337 | 0.073 | torch.Size([360]) || stage5.pa_fuse.fc12.bias
+ | 0.000 | -0.702 | 0.735 | 0.101 | torch.Size([120, 360]) || stage5.pa_fuse.fc2.weight
+ | -0.005 | -0.422 | 0.451 | 0.157 | torch.Size([120]) || stage5.pa_fuse.fc2.bias
+ | 1.444 | 1.141 | 1.615 | 0.121 | torch.Size([30]) || stage6.reshape.1.weight
+ | -0.003 | -0.150 | 0.115 | 0.074 | torch.Size([30]) || stage6.reshape.1.bias
+ | 0.001 | -0.848 | 0.822 | 0.232 | torch.Size([120, 30]) || stage6.reshape.2.weight
+ | 0.004 | -0.514 | 0.640 | 0.181 | torch.Size([120]) || stage6.reshape.2.bias
+ | 0.557 | 0.119 | 0.895 | 0.153 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.weight
+ | -0.070 | -0.374 | 0.181 | 0.100 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.bias
+ | 0.001 | -0.438 | 0.141 | 0.054 | torch.Size([675, 6]) || stage6.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.0.attn.position_bias
+ | 0.000 | -0.339 | 0.306 | 0.051 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.005 | -0.318 | 0.257 | 0.059 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.473 | 0.491 | 0.061 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.attn.proj.weight
+ | -0.001 | -0.330 | 0.253 | 0.125 | torch.Size([120]) || stage6.residual_group1.blocks.0.attn.proj.bias
+ | 0.000 | -0.361 | 0.307 | 0.045 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_mut.weight
+ | -0.000 | -0.044 | 0.053 | 0.010 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.521 | 0.121 | 0.882 | 0.143 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.weight
+ | 0.003 | -0.212 | 0.271 | 0.104 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.bias
+ | -0.000 | -0.360 | 0.360 | 0.075 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc11.weight
+ | -0.095 | -0.280 | 0.021 | 0.059 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.354 | 0.331 | 0.069 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc12.weight
+ | -0.005 | -0.196 | 0.129 | 0.048 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc12.bias
+ | 0.001 | -0.486 | 0.379 | 0.080 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.001 | -0.154 | 0.154 | 0.069 | torch.Size([120]) || stage6.residual_group1.blocks.0.mlp.fc2.bias
+ | 0.587 | 0.200 | 0.865 | 0.122 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.weight
+ | -0.118 | -0.374 | 0.082 | 0.089 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.bias
+ | 0.001 | -0.423 | 0.140 | 0.050 | torch.Size([675, 6]) || stage6.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.1.attn.position_bias
+ | 0.000 | -0.315 | 0.354 | 0.057 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.001 | -0.184 | 0.148 | 0.047 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.626 | 0.422 | 0.060 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.attn.proj.weight
+ | 0.004 | -0.234 | 0.187 | 0.087 | torch.Size([120]) || stage6.residual_group1.blocks.1.attn.proj.bias
+ | -0.000 | -0.692 | 0.743 | 0.058 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_mut.weight
+ | -0.000 | -0.038 | 0.041 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 0.590 | 0.287 | 0.942 | 0.125 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.weight
+ | -0.006 | -0.196 | 0.203 | 0.076 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.bias
+ | 0.000 | -0.427 | 0.431 | 0.075 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.080 | -0.242 | 0.033 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.293 | 0.362 | 0.069 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc12.weight
+ | 0.001 | -0.171 | 0.207 | 0.047 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.423 | 0.467 | 0.077 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.000 | -0.152 | 0.184 | 0.057 | torch.Size([120]) || stage6.residual_group1.blocks.1.mlp.fc2.bias
+ | 0.703 | 0.255 | 1.008 | 0.132 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.weight
+ | -0.125 | -0.342 | 0.042 | 0.078 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.bias
+ | 0.000 | -0.381 | 0.350 | 0.052 | torch.Size([675, 6]) || stage6.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.2.attn.position_bias
+ | 0.000 | -0.426 | 0.500 | 0.058 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_self.weight
+ | -0.003 | -0.262 | 0.226 | 0.054 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_self.bias
+ | -0.001 | -0.299 | 0.325 | 0.055 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.attn.proj.weight
+ | -0.001 | -0.149 | 0.096 | 0.061 | torch.Size([120]) || stage6.residual_group1.blocks.2.attn.proj.bias
+ | 0.000 | -0.406 | 0.391 | 0.055 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_mut.weight
+ | 0.001 | -0.055 | 0.085 | 0.015 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 0.666 | 0.308 | 0.942 | 0.118 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.weight
+ | -0.005 | -0.203 | 0.265 | 0.086 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.bias
+ | -0.000 | -0.349 | 0.494 | 0.072 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.071 | -0.213 | 0.071 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc11.bias
+ | 0.000 | -0.294 | 0.408 | 0.066 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc12.weight
+ | -0.003 | -0.120 | 0.147 | 0.049 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc12.bias
+ | -0.000 | -0.303 | 0.304 | 0.073 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.mlp.fc2.weight
+ | -0.005 | -0.150 | 0.129 | 0.063 | torch.Size([120]) || stage6.residual_group1.blocks.2.mlp.fc2.bias
+ | 0.702 | 0.307 | 0.960 | 0.129 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.weight
+ | -0.100 | -0.262 | 0.057 | 0.070 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.bias
+ | 0.001 | -0.501 | 0.290 | 0.062 | torch.Size([675, 6]) || stage6.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.3.attn.position_bias
+ | -0.000 | -0.349 | 0.336 | 0.061 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_self.weight
+ | 0.001 | -0.287 | 0.202 | 0.053 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_self.bias
+ | 0.000 | -0.322 | 0.401 | 0.056 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.attn.proj.weight
+ | -0.004 | -0.182 | 0.151 | 0.062 | torch.Size([120]) || stage6.residual_group1.blocks.3.attn.proj.bias
+ | 0.000 | -0.441 | 0.444 | 0.054 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_mut.weight
+ | 0.000 | -0.038 | 0.033 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 0.666 | 0.317 | 0.970 | 0.117 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.weight
+ | -0.003 | -0.173 | 0.168 | 0.067 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.bias
+ | -0.000 | -0.354 | 0.408 | 0.070 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.072 | -0.297 | 0.067 | 0.065 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc11.bias
+ | 0.000 | -0.299 | 0.335 | 0.066 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.004 | -0.191 | 0.136 | 0.060 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc12.bias
+ | -0.000 | -0.400 | 0.590 | 0.071 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.mlp.fc2.weight
+ | -0.005 | -0.159 | 0.142 | 0.061 | torch.Size([120]) || stage6.residual_group1.blocks.3.mlp.fc2.bias
+ | 0.730 | 0.334 | 0.963 | 0.118 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.weight
+ | -0.064 | -0.201 | 0.064 | 0.055 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.bias
+ | -0.000 | -0.702 | 1.180 | 0.086 | torch.Size([675, 6]) || stage6.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.483 | 0.398 | 0.073 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_self.weight
+ | 0.004 | -0.480 | 0.514 | 0.080 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_self.bias
+ | 0.000 | -0.331 | 0.390 | 0.056 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.attn.proj.weight
+ | -0.004 | -0.141 | 0.167 | 0.050 | torch.Size([120]) || stage6.residual_group1.blocks.4.attn.proj.bias
+ | 0.000 | -0.387 | 0.470 | 0.048 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_mut.weight
+ | 0.001 | -0.065 | 0.039 | 0.010 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.656 | 0.235 | 0.874 | 0.105 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.weight
+ | -0.005 | -0.237 | 0.171 | 0.074 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.bias
+ | -0.000 | -0.440 | 0.483 | 0.075 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc11.weight
+ | -0.076 | -0.347 | 0.110 | 0.076 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc11.bias
+ | 0.000 | -0.286 | 0.348 | 0.070 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.001 | -0.189 | 0.169 | 0.069 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc12.bias
+ | 0.000 | -0.398 | 0.336 | 0.075 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.004 | -0.127 | 0.137 | 0.052 | torch.Size([120]) || stage6.residual_group1.blocks.4.mlp.fc2.bias
+ | 0.691 | 0.178 | 0.975 | 0.116 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.weight
+ | -0.042 | -0.137 | 0.099 | 0.037 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.bias
+ | -0.001 | -0.662 | 1.078 | 0.078 | torch.Size([675, 6]) || stage6.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -0.359 | 0.531 | 0.072 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_self.weight
+ | 0.002 | -0.293 | 0.311 | 0.075 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_self.bias
+ | 0.000 | -0.426 | 0.488 | 0.055 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.attn.proj.weight
+ | -0.006 | -0.103 | 0.159 | 0.044 | torch.Size([120]) || stage6.residual_group1.blocks.5.attn.proj.bias
+ | 0.000 | -0.401 | 0.385 | 0.044 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_mut.weight
+ | 0.001 | -0.039 | 0.043 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.607 | 0.210 | 0.802 | 0.094 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.weight
+ | -0.004 | -0.178 | 0.199 | 0.068 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.bias
+ | -0.000 | -0.377 | 0.541 | 0.079 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc11.weight
+ | -0.069 | -0.429 | 0.280 | 0.096 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc11.bias
+ | -0.000 | -0.394 | 0.344 | 0.077 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.000 | -0.241 | 0.223 | 0.085 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc12.bias
+ | -0.000 | -0.527 | 0.647 | 0.077 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.mlp.fc2.weight
+ | -0.006 | -0.126 | 0.157 | 0.047 | torch.Size([120]) || stage6.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.001 | -0.294 | 0.287 | 0.060 | torch.Size([120, 120]) || stage6.linear1.weight
+ | 0.006 | -0.543 | 0.664 | 0.193 | torch.Size([120]) || stage6.linear1.bias
+ | 0.674 | 0.222 | 1.065 | 0.154 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.weight
+ | 0.002 | -0.480 | 0.311 | 0.128 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.bias
+ | 0.000 | -0.629 | 0.461 | 0.041 | torch.Size([3375, 6]) || stage6.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage6.residual_group2.blocks.0.attn.relative_position_index
+ | 0.000 | -0.495 | 0.440 | 0.085 | torch.Size([360, 120]) || stage6.residual_group2.blocks.0.attn.qkv_self.weight
+ | -0.001 | -0.516 | 0.468 | 0.114 | torch.Size([360]) || stage6.residual_group2.blocks.0.attn.qkv_self.bias
+ | 0.001 | -0.369 | 0.377 | 0.085 | torch.Size([120, 120]) || stage6.residual_group2.blocks.0.attn.proj.weight
+ | -0.003 | -0.297 | 0.292 | 0.113 | torch.Size([120]) || stage6.residual_group2.blocks.0.attn.proj.bias
+ | 0.644 | 0.181 | 1.104 | 0.153 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.weight
+ | 0.003 | -0.167 | 0.185 | 0.070 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.bias
+ | -0.000 | -0.383 | 0.534 | 0.087 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.101 | -0.214 | 0.048 | 0.051 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.350 | 0.560 | 0.085 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.005 | -0.159 | 0.138 | 0.047 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc12.bias
+ | -0.001 | -0.374 | 0.488 | 0.091 | torch.Size([120, 240]) || stage6.residual_group2.blocks.0.mlp.fc2.weight
+ | -0.006 | -0.271 | 0.252 | 0.096 | torch.Size([120]) || stage6.residual_group2.blocks.0.mlp.fc2.bias
+ | 0.663 | 0.353 | 0.959 | 0.106 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.weight
+ | 0.001 | -0.314 | 0.289 | 0.089 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.bias
+ | 0.000 | -0.772 | 0.763 | 0.041 | torch.Size([3375, 6]) || stage6.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage6.residual_group2.blocks.1.attn.relative_position_index
+ | -0.000 | -0.495 | 0.604 | 0.086 | torch.Size([360, 120]) || stage6.residual_group2.blocks.1.attn.qkv_self.weight
+ | 0.005 | -0.491 | 0.401 | 0.097 | torch.Size([360]) || stage6.residual_group2.blocks.1.attn.qkv_self.bias
+ | 0.001 | -0.380 | 0.376 | 0.076 |
torch.Size([120, 120]) || stage6.residual_group2.blocks.1.attn.proj.weight + | -0.007 | -0.321 | 0.234 | 0.096 | torch.Size([120]) || stage6.residual_group2.blocks.1.attn.proj.bias + | 0.666 | 0.226 | 1.153 | 0.138 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.weight + | 0.001 | -0.178 | 0.220 | 0.069 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.514 | 0.608 | 0.090 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc11.weight + | -0.132 | -0.313 | 0.023 | 0.059 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.423 | 0.488 | 0.088 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc12.weight + | -0.002 | -0.153 | 0.122 | 0.053 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc12.bias + | 0.000 | -0.399 | 0.435 | 0.087 | torch.Size([120, 240]) || stage6.residual_group2.blocks.1.mlp.fc2.weight + | -0.001 | -0.285 | 0.241 | 0.093 | torch.Size([120]) || stage6.residual_group2.blocks.1.mlp.fc2.bias + | 0.000 | -0.308 | 0.365 | 0.070 | torch.Size([120, 120]) || stage6.linear2.weight + | -0.002 | -0.699 | 0.757 | 0.303 | torch.Size([120]) || stage6.linear2.bias + | 0.000 | -0.130 | 0.129 | 0.027 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.weight + | -0.001 | -0.051 | 0.045 | 0.018 | torch.Size([120]) || stage6.pa_deform.bias + | 0.000 | -0.017 | 0.017 | 0.010 | torch.Size([120, 364, 3, 3]) || stage6.pa_deform.conv_offset.0.weight + | -0.007 | -0.049 | 0.026 | 0.012 | torch.Size([120]) || stage6.pa_deform.conv_offset.0.bias + | -0.001 | -0.090 | 0.114 | 0.020 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.2.weight + | -0.008 | -0.070 | 0.060 | 0.030 | torch.Size([120]) || stage6.pa_deform.conv_offset.2.bias + | -0.001 | -0.097 | 0.101 | 0.020 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.4.weight + | 0.006 | -0.096 | 0.114 | 0.044 | torch.Size([120]) || stage6.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432, 120, 3, 3]) || stage6.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432]) || stage6.pa_deform.conv_offset.6.bias + | -0.002 | -0.822 | 0.740 | 0.127 | torch.Size([360, 360]) || stage6.pa_fuse.fc11.weight + | 0.212 | -0.394 | 0.913 | 0.216 | torch.Size([360]) || stage6.pa_fuse.fc11.bias + | -0.000 | -0.948 | 0.848 | 0.131 | torch.Size([360, 360]) || stage6.pa_fuse.fc12.weight + | 0.001 | -0.657 | 0.605 | 0.279 | torch.Size([360]) || stage6.pa_fuse.fc12.bias + | -0.000 | -0.678 | 0.823 | 0.158 | torch.Size([120, 360]) || stage6.pa_fuse.fc2.weight + | 0.009 | -0.616 | 0.477 | 0.283 | torch.Size([120]) || stage6.pa_fuse.fc2.bias + | 1.363 | 1.278 | 1.458 | 0.048 | torch.Size([30]) || stage7.reshape.1.weight + | -0.001 | -0.247 | 0.227 | 0.139 | torch.Size([30]) || stage7.reshape.1.bias + | -0.000 | -0.590 | 0.587 | 0.179 | torch.Size([120, 30]) || stage7.reshape.2.weight + | -0.029 | -0.525 | 0.546 | 0.231 | torch.Size([120]) || stage7.reshape.2.bias + | 0.406 | 0.101 | 0.864 | 0.138 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.weight + | -0.159 | -0.667 | 0.525 | 0.161 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.bias + | -0.174 | -2.385 | 4.798 | 0.381 | torch.Size([675, 6]) || stage7.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || 
stage7.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.809 | 0.687 | 0.111 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_self.weight + | 0.001 | -0.275 | 0.262 | 0.057 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_self.bias + | -0.000 | -0.416 | 0.438 | 0.096 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.attn.proj.weight + | 0.008 | -0.499 | 0.295 | 0.131 | torch.Size([120]) || stage7.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -1.494 | 1.378 | 0.106 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.123 | 0.106 | 0.015 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.284 | 0.172 | 0.377 | 0.040 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.weight + | -0.003 | -0.502 | 0.588 | 0.124 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.597 | 0.567 | 0.132 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc11.weight + | -0.061 | -0.420 | 0.409 | 0.104 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.606 | 0.601 | 0.144 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc12.weight + | -0.003 | -0.306 | 0.261 | 0.101 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc12.bias + | -0.001 | -0.572 | 0.609 | 0.149 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.mlp.fc2.weight + | -0.008 | -0.373 | 0.306 | 0.099 | torch.Size([120]) || stage7.residual_group1.blocks.0.mlp.fc2.bias + | 0.538 | 0.114 | 0.809 | 0.125 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.weight + | -0.129 | -0.865 | 0.532 | 0.163 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.bias + | -0.281 | -2.710 | 4.413 | 0.432 | torch.Size([675, 6]) || stage7.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.646 | 0.655 | 0.135 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_self.weight + | -0.000 | -0.301 | 0.303 | 0.068 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.479 | 0.463 | 0.100 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.attn.proj.weight + | 0.016 | -0.460 | 0.313 | 0.135 | torch.Size([120]) || stage7.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -2.205 | 2.065 | 0.127 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.000 | -0.074 | 0.085 | 0.017 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.353 | 0.243 | 0.425 | 0.034 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.weight + | -0.008 | -0.643 | 0.628 | 0.146 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.535 | 0.617 | 0.135 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc11.weight + | -0.054 | -0.348 | 0.244 | 0.109 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc11.bias + | -0.001 | -0.671 | 0.611 | 0.148 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc12.weight + | 0.004 | -0.272 | 0.292 | 0.098 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.672 | 0.595 | 0.149 | torch.Size([120, 240]) || 
stage7.residual_group1.blocks.1.mlp.fc2.weight + | -0.003 | -0.398 | 0.273 | 0.088 | torch.Size([120]) || stage7.residual_group1.blocks.1.mlp.fc2.bias + | 0.581 | 0.093 | 0.791 | 0.147 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.weight + | -0.143 | -1.023 | 0.481 | 0.167 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.bias + | -0.098 | -2.171 | 4.402 | 0.287 | torch.Size([675, 6]) || stage7.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.640 | 0.701 | 0.147 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_self.weight + | -0.005 | -0.328 | 0.408 | 0.072 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_self.bias + | -0.001 | -0.417 | 0.441 | 0.101 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.attn.proj.weight + | 0.007 | -0.508 | 0.265 | 0.127 | torch.Size([120]) || stage7.residual_group1.blocks.2.attn.proj.bias + | -0.001 | -2.511 | 2.484 | 0.143 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.000 | -0.093 | 0.104 | 0.019 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.392 | 0.276 | 0.487 | 0.034 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.weight + | -0.016 | -0.555 | 0.581 | 0.143 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.630 | 0.674 | 0.135 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc11.weight + | -0.072 | -0.420 | 0.173 | 0.115 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.654 | 0.793 | 0.152 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc12.weight + | -0.003 | -0.303 | 0.263 | 0.098 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.603 | 0.658 | 0.150 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.mlp.fc2.weight + | 0.003 | -0.301 | 0.247 | 0.081 | torch.Size([120]) || stage7.residual_group1.blocks.2.mlp.fc2.bias + | 0.611 | 0.127 | 0.811 | 0.134 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.weight + | -0.137 | -0.781 | 0.684 | 0.164 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.bias + | -0.109 | -4.577 | 4.527 | 0.332 | torch.Size([675, 6]) || stage7.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.757 | 0.743 | 0.146 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_self.weight + | 0.001 | -0.358 | 0.342 | 0.083 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_self.bias + | 0.001 | -0.465 | 0.447 | 0.097 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.attn.proj.weight + | 0.002 | -0.389 | 0.233 | 0.113 | torch.Size([120]) || stage7.residual_group1.blocks.3.attn.proj.bias + | -0.001 | -1.947 | 1.928 | 0.127 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.000 | -0.106 | 0.070 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.410 | 0.283 | 0.489 | 0.035 | torch.Size([120]) || 
stage7.residual_group1.blocks.3.norm2.weight + | -0.014 | -0.442 | 0.639 | 0.147 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.bias + | -0.000 | -0.542 | 0.585 | 0.132 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc11.weight + | -0.069 | -0.463 | 0.214 | 0.122 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.689 | 0.605 | 0.154 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc12.weight + | -0.008 | -0.307 | 0.279 | 0.096 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc12.bias + | -0.000 | -0.593 | 0.603 | 0.152 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.mlp.fc2.weight + | 0.010 | -0.269 | 0.270 | 0.094 | torch.Size([120]) || stage7.residual_group1.blocks.3.mlp.fc2.bias + | 0.652 | 0.132 | 0.859 | 0.133 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.weight + | -0.131 | -0.662 | 0.729 | 0.163 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.bias + | -0.092 | -4.521 | 3.027 | 0.337 | torch.Size([675, 6]) || stage7.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.694 | 0.828 | 0.148 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_self.weight + | 0.002 | -0.328 | 0.361 | 0.078 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.430 | 0.483 | 0.100 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.attn.proj.weight + | -0.003 | -0.368 | 0.250 | 0.103 | torch.Size([120]) || stage7.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -1.506 | 1.779 | 0.122 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.000 | -0.090 | 0.112 | 0.020 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.435 | 0.347 | 0.536 | 0.033 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.weight + | -0.018 | -0.345 | 0.609 | 0.136 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.bias + | -0.001 | -0.580 | 0.558 | 0.132 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc11.weight + | -0.066 | -0.392 | 0.239 | 0.128 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.608 | 0.667 | 0.157 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc12.weight + | -0.001 | -0.276 | 0.296 | 0.105 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.666 | 0.775 | 0.155 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.mlp.fc2.weight + | 0.001 | -0.380 | 0.360 | 0.101 | torch.Size([120]) || stage7.residual_group1.blocks.4.mlp.fc2.bias + | 0.648 | 0.269 | 0.885 | 0.109 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.weight + | -0.116 | -0.436 | 0.749 | 0.144 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.bias + | -0.130 | -3.976 | 4.665 | 0.318 | torch.Size([675, 6]) || stage7.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.702 | 0.671 | 0.140 | torch.Size([360, 120]) || 
stage7.residual_group1.blocks.5.attn.qkv_self.weight + | 0.000 | -0.346 | 0.340 | 0.078 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.410 | 0.394 | 0.091 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.attn.proj.weight + | 0.006 | -0.286 | 0.244 | 0.100 | torch.Size([120]) || stage7.residual_group1.blocks.5.attn.proj.bias + | 0.001 | -0.870 | 0.885 | 0.109 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.001 | -0.120 | 0.096 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.445 | 0.326 | 0.595 | 0.034 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.weight + | -0.016 | -0.233 | 0.558 | 0.110 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.bias + | -0.001 | -0.576 | 0.577 | 0.129 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc11.weight + | -0.038 | -0.525 | 0.269 | 0.139 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.672 | 0.671 | 0.158 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc12.weight + | 0.003 | -0.400 | 0.281 | 0.116 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.937 | 0.714 | 0.156 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.mlp.fc2.weight + | 0.007 | -0.435 | 0.876 | 0.188 | torch.Size([120]) || stage7.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.234 | 0.212 | 0.056 | torch.Size([120, 120]) || stage7.linear1.weight + | -0.033 | -0.655 | 0.586 | 0.242 | torch.Size([120]) || stage7.linear1.bias + | 0.684 | 0.257 | 0.867 | 0.090 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.weight + | -0.003 | -0.857 | 0.829 | 0.193 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.bias + | -0.005 | -5.628 | 1.358 | 0.121 | torch.Size([3375, 6]) || stage7.residual_group2.blocks.0.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage7.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.699 | 0.827 | 0.137 | torch.Size([360, 120]) || stage7.residual_group2.blocks.0.attn.qkv_self.weight + | 0.001 | -0.821 | 0.662 | 0.143 | torch.Size([360]) || stage7.residual_group2.blocks.0.attn.qkv_self.bias + | 0.001 | -0.392 | 0.418 | 0.106 | torch.Size([120, 120]) || stage7.residual_group2.blocks.0.attn.proj.weight + | 0.003 | -0.147 | 0.171 | 0.052 | torch.Size([120]) || stage7.residual_group2.blocks.0.attn.proj.bias + | 0.431 | 0.316 | 0.521 | 0.036 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.weight + | -0.003 | -0.595 | 0.673 | 0.129 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.bias + | -0.000 | -0.701 | 0.542 | 0.119 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc11.weight + | 0.017 | -0.290 | 0.421 | 0.117 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.603 | 0.637 | 0.145 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc12.weight + | -0.006 | -0.394 | 0.426 | 0.098 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.602 | 0.607 | 0.144 | torch.Size([120, 240]) || stage7.residual_group2.blocks.0.mlp.fc2.weight + | -0.003 | -0.460 | 0.272 | 0.112 | torch.Size([120]) || stage7.residual_group2.blocks.0.mlp.fc2.bias + | 0.655 | 0.251 | 0.779 | 0.074 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.weight + | -0.004 | -0.718 | 0.811 | 0.153 | torch.Size([120]) || 
stage7.residual_group2.blocks.1.norm1.bias + | -0.007 | -3.104 | 1.224 | 0.101 | torch.Size([3375, 6]) || stage7.residual_group2.blocks.1.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage7.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.664 | 0.647 | 0.137 | torch.Size([360, 120]) || stage7.residual_group2.blocks.1.attn.qkv_self.weight + | 0.002 | -0.532 | 0.746 | 0.150 | torch.Size([360]) || stage7.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.428 | 0.360 | 0.100 | torch.Size([120, 120]) || stage7.residual_group2.blocks.1.attn.proj.weight + | 0.009 | -0.244 | 0.242 | 0.063 | torch.Size([120]) || stage7.residual_group2.blocks.1.attn.proj.bias + | 0.442 | 0.284 | 0.530 | 0.038 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.weight + | -0.004 | -0.421 | 0.664 | 0.106 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.bias + | -0.001 | -0.604 | 0.583 | 0.119 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc11.weight + | 0.028 | -0.389 | 0.406 | 0.134 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc11.bias + | -0.001 | -0.681 | 0.818 | 0.148 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc12.weight + | 0.003 | -0.247 | 0.361 | 0.096 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.783 | 0.835 | 0.146 | torch.Size([120, 240]) || stage7.residual_group2.blocks.1.mlp.fc2.weight + | 0.008 | -0.529 | 0.922 | 0.144 | torch.Size([120]) || stage7.residual_group2.blocks.1.mlp.fc2.bias + | -0.001 | -0.353 | 0.277 | 0.071 | torch.Size([120, 120]) || stage7.linear2.weight + | -0.026 | -0.905 | 0.749 | 0.262 | torch.Size([120]) || stage7.linear2.bias + | -0.000 | -0.125 | 0.138 | 0.027 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.weight + | -0.003 | -0.091 | 0.071 | 0.030 | torch.Size([120]) || stage7.pa_deform.bias + | 0.000 | -0.017 | 0.017 | 0.010 | torch.Size([120, 364, 3, 3]) || stage7.pa_deform.conv_offset.0.weight + | -0.000 | -0.028 | 0.054 | 0.015 | torch.Size([120]) || stage7.pa_deform.conv_offset.0.bias + | -0.001 | -0.130 | 0.111 | 0.017 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.2.weight + | -0.004 | -0.105 | 0.094 | 0.040 | torch.Size([120]) || stage7.pa_deform.conv_offset.2.bias + | -0.002 | -0.203 | 0.124 | 0.016 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.4.weight + | 0.027 | -0.097 | 0.151 | 0.048 | torch.Size([120]) || stage7.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432, 120, 3, 3]) || stage7.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432]) || stage7.pa_deform.conv_offset.6.bias + | -0.002 | -0.997 | 1.031 | 0.156 | torch.Size([360, 360]) || stage7.pa_fuse.fc11.weight + | 0.219 | -0.261 | 0.769 | 0.213 | torch.Size([360]) || stage7.pa_fuse.fc11.bias + | 0.001 | -1.119 | 1.206 | 0.175 | torch.Size([360, 360]) || stage7.pa_fuse.fc12.weight + | -0.011 | -0.547 | 0.598 | 0.195 | torch.Size([360]) || stage7.pa_fuse.fc12.bias + | 0.000 | -0.860 | 0.957 | 0.160 | torch.Size([120, 360]) || stage7.pa_fuse.fc2.weight + | 0.018 | -1.017 | 0.731 | 0.363 | torch.Size([120]) || stage7.pa_fuse.fc2.bias + | 1.491 | 1.080 | 1.847 | 0.135 | torch.Size([120]) || stage8.0.1.weight + | -0.012 | -0.370 | 0.414 | 0.140 | torch.Size([120]) || stage8.0.1.bias + | -0.000 | -0.882 | 1.114 | 0.177 | torch.Size([180, 120]) || stage8.0.2.weight + | -0.005 | -1.101 | 0.699 | 0.167 | torch.Size([180]) || 
stage8.0.2.bias + | 0.622 | 0.186 | 1.009 | 0.188 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.weight + | -0.006 | -0.884 | 1.056 | 0.212 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.bias + | -0.003 | -2.578 | 2.238 | 0.223 | torch.Size([3375, 6]) || stage8.1.residual_group.blocks.0.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.1.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -1.042 | 1.335 | 0.152 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.0.attn.qkv_self.weight + | -0.007 | -0.992 | 0.938 | 0.208 | torch.Size([540]) || stage8.1.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.692 | 0.565 | 0.129 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.0.attn.proj.weight + | 0.009 | -1.288 | 0.895 | 0.185 | torch.Size([180]) || stage8.1.residual_group.blocks.0.attn.proj.bias + | 0.415 | 0.180 | 0.539 | 0.066 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.weight + | -0.006 | -0.634 | 0.818 | 0.145 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.bias + | 0.001 | -0.969 | 0.867 | 0.145 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc11.weight + | -0.055 | -0.545 | 0.271 | 0.110 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.698 | 0.845 | 0.153 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc12.weight + | 0.007 | -0.526 | 0.444 | 0.126 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.812 | 0.874 | 0.155 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.0.mlp.fc2.weight + | 0.009 | -0.468 | 0.864 | 0.160 | torch.Size([180]) || stage8.1.residual_group.blocks.0.mlp.fc2.bias + | 0.724 | 0.198 | 0.915 | 0.128 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.weight + | -0.003 | -1.026 | 0.953 | 0.209 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.bias + | 0.030 | -3.042 | 1.112 | 0.227 | torch.Size([3375, 6]) || stage8.1.residual_group.blocks.1.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.1.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -1.192 | 0.952 | 0.169 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.1.attn.qkv_self.weight + | -0.009 | -1.186 | 0.822 | 0.191 | torch.Size([540]) || stage8.1.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.500 | 0.647 | 0.121 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.1.attn.proj.weight + | 0.004 | -0.892 | 1.020 | 0.208 | torch.Size([180]) || stage8.1.residual_group.blocks.1.attn.proj.bias + | 0.492 | 0.230 | 0.628 | 0.064 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.weight + | -0.006 | -0.853 | 0.872 | 0.165 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.bias + | 0.001 | -0.748 | 0.701 | 0.150 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc11.weight + | -0.055 | -0.409 | 0.305 | 0.096 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.806 | 0.662 | 0.155 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc12.weight + | 0.001 | -0.304 | 0.419 | 0.096 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.841 | 0.781 | 0.154 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.1.mlp.fc2.weight + | 0.005 | -0.280 | 0.641 | 0.119 | torch.Size([180]) || 
stage8.1.residual_group.blocks.1.mlp.fc2.bias + | 0.803 | 0.314 | 1.038 | 0.110 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.weight + | -0.006 | -1.202 | 1.119 | 0.207 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.bias + | -0.002 | -2.783 | 1.481 | 0.236 | torch.Size([3375, 6]) || stage8.1.residual_group.blocks.2.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.1.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.957 | 0.943 | 0.162 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.2.attn.qkv_self.weight + | 0.002 | -0.519 | 0.526 | 0.136 | torch.Size([540]) || stage8.1.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.543 | 0.516 | 0.117 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.2.attn.proj.weight + | 0.005 | -0.711 | 0.838 | 0.184 | torch.Size([180]) || stage8.1.residual_group.blocks.2.attn.proj.bias + | 0.549 | 0.206 | 0.679 | 0.078 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.weight + | -0.005 | -0.888 | 0.879 | 0.154 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.bias + | 0.000 | -0.748 | 0.896 | 0.148 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc11.weight + | -0.073 | -0.478 | 0.193 | 0.098 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.628 | 0.674 | 0.157 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc12.weight + | -0.001 | -0.331 | 0.230 | 0.082 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc12.bias + | 0.001 | -0.677 | 0.673 | 0.154 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.2.mlp.fc2.weight + | 0.004 | -0.294 | 0.745 | 0.112 | torch.Size([180]) || stage8.1.residual_group.blocks.2.mlp.fc2.bias + | 0.843 | 0.308 | 0.966 | 0.094 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.weight + | -0.002 | -1.222 | 1.324 | 0.192 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.bias + | 0.001 | -2.899 | 2.240 | 0.272 | torch.Size([3375, 6]) || stage8.1.residual_group.blocks.3.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.1.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.999 | 0.935 | 0.167 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.3.attn.qkv_self.weight + | -0.001 | -0.612 | 0.531 | 0.127 | torch.Size([540]) || stage8.1.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.591 | 0.537 | 0.112 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.3.attn.proj.weight + | -0.005 | -0.476 | 1.034 | 0.188 | torch.Size([180]) || stage8.1.residual_group.blocks.3.attn.proj.bias + | 0.534 | 0.198 | 0.660 | 0.074 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.weight + | -0.006 | -0.845 | 0.869 | 0.130 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.bias + | 0.001 | -0.649 | 0.677 | 0.147 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc11.weight + | -0.080 | -0.378 | 0.228 | 0.109 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.628 | 0.683 | 0.157 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc12.weight + | -0.005 | -0.300 | 0.222 | 0.083 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc12.bias + | 0.001 | -0.959 | 0.733 | 0.153 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.3.mlp.fc2.weight + | 0.003 | -0.915 | 0.961 | 0.165 | torch.Size([180]) || 
stage8.1.residual_group.blocks.3.mlp.fc2.bias + | 0.001 | -0.411 | 0.533 | 0.070 | torch.Size([180, 180]) || stage8.1.linear.weight + | -0.004 | -0.907 | 0.257 | 0.135 | torch.Size([180]) || stage8.1.linear.bias + | 0.890 | 0.143 | 1.178 | 0.177 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.weight + | -0.034 | -0.781 | 0.959 | 0.177 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.bias + | 0.001 | -2.545 | 1.182 | 0.186 | torch.Size([3375, 6]) || stage8.2.residual_group.blocks.0.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.2.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -1.151 | 1.199 | 0.158 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.731 | 0.744 | 0.155 | torch.Size([540]) || stage8.2.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.522 | 0.577 | 0.131 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.0.attn.proj.weight + | 0.003 | -0.537 | 0.895 | 0.164 | torch.Size([180]) || stage8.2.residual_group.blocks.0.attn.proj.bias + | 0.599 | 0.203 | 0.779 | 0.101 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.weight + | -0.021 | -0.429 | 1.016 | 0.143 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.bias + | -0.000 | -0.914 | 0.736 | 0.145 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc11.weight + | -0.054 | -0.545 | 0.183 | 0.106 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.716 | 0.750 | 0.155 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc12.weight + | 0.003 | -0.254 | 0.408 | 0.085 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.842 | 0.706 | 0.153 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.0.mlp.fc2.weight + | 0.001 | -0.277 | 0.365 | 0.093 | torch.Size([180]) || stage8.2.residual_group.blocks.0.mlp.fc2.bias + | 0.910 | 0.151 | 1.164 | 0.152 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.weight + | -0.032 | -0.801 | 1.151 | 0.191 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.bias + | -0.069 | -2.776 | 5.771 | 0.290 | torch.Size([3375, 6]) || stage8.2.residual_group.blocks.1.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.2.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -1.359 | 1.101 | 0.156 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.1.attn.qkv_self.weight + | 0.009 | -0.624 | 0.654 | 0.155 | torch.Size([540]) || stage8.2.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.565 | 0.575 | 0.134 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.1.attn.proj.weight + | -0.004 | -0.671 | 0.566 | 0.171 | torch.Size([180]) || stage8.2.residual_group.blocks.1.attn.proj.bias + | 0.609 | 0.206 | 0.818 | 0.109 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.weight + | -0.022 | -0.474 | 1.079 | 0.147 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.bias + | 0.000 | -0.760 | 0.819 | 0.143 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc11.weight + | -0.045 | -0.414 | 0.277 | 0.106 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.831 | 0.809 | 0.155 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc12.weight + | -0.002 | -0.544 | 0.244 | 0.082 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | 
-0.749 | 0.962 | 0.151 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.1.mlp.fc2.weight + | 0.011 | -0.275 | 0.294 | 0.101 | torch.Size([180]) || stage8.2.residual_group.blocks.1.mlp.fc2.bias + | 0.990 | 0.168 | 1.270 | 0.152 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.weight + | -0.034 | -0.773 | 1.134 | 0.182 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.bias + | -0.070 | -2.190 | 5.577 | 0.255 | torch.Size([3375, 6]) || stage8.2.residual_group.blocks.2.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.2.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -1.004 | 1.113 | 0.152 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.2.attn.qkv_self.weight + | 0.000 | -0.781 | 0.551 | 0.137 | torch.Size([540]) || stage8.2.residual_group.blocks.2.attn.qkv_self.bias + | 0.001 | -0.580 | 0.572 | 0.141 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.2.attn.proj.weight + | -0.001 | -0.554 | 0.820 | 0.177 | torch.Size([180]) || stage8.2.residual_group.blocks.2.attn.proj.bias + | 0.642 | 0.178 | 0.852 | 0.111 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.weight + | -0.025 | -0.413 | 0.853 | 0.124 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.bias + | -0.000 | -0.780 | 1.141 | 0.143 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc11.weight + | -0.067 | -0.860 | 0.177 | 0.114 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -1.067 | 0.859 | 0.155 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc12.weight + | 0.002 | -0.298 | 0.225 | 0.072 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.726 | 0.809 | 0.151 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.2.mlp.fc2.weight + | 0.001 | -0.394 | 0.292 | 0.112 | torch.Size([180]) || stage8.2.residual_group.blocks.2.mlp.fc2.bias + | 0.990 | 0.219 | 1.226 | 0.130 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.weight + | -0.032 | -0.837 | 1.156 | 0.168 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.bias + | -0.005 | -4.045 | 1.695 | 0.178 | torch.Size([3375, 6]) || stage8.2.residual_group.blocks.3.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.2.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.855 | 1.101 | 0.153 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.3.attn.qkv_self.weight + | -0.002 | -0.706 | 0.841 | 0.123 | torch.Size([540]) || stage8.2.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.586 | 0.699 | 0.134 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.3.attn.proj.weight + | 0.001 | -0.402 | 0.842 | 0.173 | torch.Size([180]) || stage8.2.residual_group.blocks.3.attn.proj.bias + | 0.613 | 0.196 | 0.800 | 0.102 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.weight + | -0.021 | -0.404 | 0.907 | 0.115 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.bias + | 0.000 | -0.718 | 0.654 | 0.138 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc11.weight + | -0.064 | -0.568 | 0.205 | 0.115 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc11.bias + | -0.001 | -0.674 | 0.596 | 0.155 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc12.weight + | -0.012 | -0.279 | 0.171 | 0.073 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.634 
| 0.692 | 0.150 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.3.mlp.fc2.weight + | 0.010 | -0.528 | 1.331 | 0.175 | torch.Size([180]) || stage8.2.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.361 | 0.549 | 0.078 | torch.Size([180, 180]) || stage8.2.linear.weight + | -0.001 | -0.682 | 0.349 | 0.142 | torch.Size([180]) || stage8.2.linear.bias + | 1.018 | 0.177 | 1.365 | 0.177 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.weight + | -0.033 | -0.673 | 0.916 | 0.166 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.bias + | 0.003 | -2.963 | 1.620 | 0.138 | torch.Size([3375, 6]) || stage8.3.residual_group.blocks.0.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.3.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -1.095 | 0.939 | 0.152 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.0.attn.qkv_self.weight + | 0.004 | -0.725 | 0.682 | 0.135 | torch.Size([540]) || stage8.3.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.731 | 0.755 | 0.149 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.0.attn.proj.weight + | 0.013 | -0.457 | 0.481 | 0.158 | torch.Size([180]) || stage8.3.residual_group.blocks.0.attn.proj.bias + | 0.703 | 0.276 | 0.865 | 0.096 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.weight + | -0.024 | -0.449 | 0.966 | 0.132 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.bias + | -0.001 | -0.873 | 0.665 | 0.138 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc11.weight + | -0.052 | -0.479 | 0.198 | 0.104 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.787 | 0.699 | 0.155 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc12.weight + | -0.003 | -0.436 | 0.264 | 0.081 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.675 | 0.689 | 0.153 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.0.mlp.fc2.weight + | 0.004 | -0.265 | 0.254 | 0.106 | torch.Size([180]) || stage8.3.residual_group.blocks.0.mlp.fc2.bias + | 0.956 | 0.184 | 1.255 | 0.167 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.weight + | -0.036 | -0.699 | 0.965 | 0.155 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.bias + | -0.038 | -3.913 | 4.625 | 0.210 | torch.Size([3375, 6]) || stage8.3.residual_group.blocks.1.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.3.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -1.142 | 0.934 | 0.147 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.1.attn.qkv_self.weight + | 0.000 | -0.708 | 0.560 | 0.117 | torch.Size([540]) || stage8.3.residual_group.blocks.1.attn.qkv_self.bias + | -0.002 | -0.746 | 0.626 | 0.149 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.1.attn.proj.weight + | 0.021 | -0.378 | 0.376 | 0.127 | torch.Size([180]) || stage8.3.residual_group.blocks.1.attn.proj.bias + | 0.741 | 0.282 | 0.933 | 0.107 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.weight + | -0.028 | -0.425 | 0.898 | 0.115 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.bias + | -0.001 | -0.761 | 0.822 | 0.139 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc11.weight + | -0.057 | -0.502 | 0.219 | 0.100 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.829 | 0.872 | 0.156 | torch.Size([360, 180]) || 
stage8.3.residual_group.blocks.1.mlp.fc12.weight + | 0.004 | -0.262 | 0.226 | 0.077 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc12.bias + | -0.001 | -0.797 | 0.765 | 0.153 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.1.mlp.fc2.weight + | -0.002 | -0.360 | 0.289 | 0.109 | torch.Size([180]) || stage8.3.residual_group.blocks.1.mlp.fc2.bias + | 1.068 | 0.207 | 1.335 | 0.160 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.weight + | -0.034 | -0.784 | 1.005 | 0.163 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.bias + | -0.004 | -2.897 | 1.185 | 0.143 | torch.Size([3375, 6]) || stage8.3.residual_group.blocks.2.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.3.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -1.055 | 0.899 | 0.151 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.2.attn.qkv_self.weight + | -0.000 | -0.572 | 0.670 | 0.120 | torch.Size([540]) || stage8.3.residual_group.blocks.2.attn.qkv_self.bias + | -0.001 | -0.729 | 0.798 | 0.156 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.2.attn.proj.weight + | 0.025 | -0.570 | 0.501 | 0.166 | torch.Size([180]) || stage8.3.residual_group.blocks.2.attn.proj.bias + | 0.759 | 0.228 | 0.969 | 0.115 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.weight + | -0.025 | -0.394 | 0.791 | 0.103 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.bias + | -0.001 | -0.962 | 0.903 | 0.137 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc11.weight + | -0.064 | -0.587 | 0.209 | 0.108 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.966 | 0.925 | 0.156 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc12.weight + | 0.004 | -0.366 | 0.239 | 0.074 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.782 | 0.817 | 0.152 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.2.mlp.fc2.weight + | 0.003 | -0.321 | 0.340 | 0.117 | torch.Size([180]) || stage8.3.residual_group.blocks.2.mlp.fc2.bias + | 1.082 | 0.237 | 1.309 | 0.144 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.weight + | -0.031 | -0.726 | 0.933 | 0.149 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.bias + | 0.005 | -3.023 | 1.093 | 0.142 | torch.Size([3375, 6]) || stage8.3.residual_group.blocks.3.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.3.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.830 | 0.867 | 0.151 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.3.attn.qkv_self.weight + | -0.001 | -0.487 | 0.710 | 0.107 | torch.Size([540]) || stage8.3.residual_group.blocks.3.attn.qkv_self.bias + | -0.001 | -0.940 | 0.725 | 0.157 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.3.attn.proj.weight + | 0.027 | -0.522 | 0.807 | 0.170 | torch.Size([180]) || stage8.3.residual_group.blocks.3.attn.proj.bias + | 0.705 | 0.249 | 0.868 | 0.095 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.weight + | -0.023 | -0.426 | 0.826 | 0.108 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.bias + | -0.000 | -0.814 | 0.927 | 0.131 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc11.weight + | -0.043 | -0.613 | 0.209 | 0.116 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.709 | 0.851 | 0.154 | torch.Size([360, 180]) || 
stage8.3.residual_group.blocks.3.mlp.fc12.weight + | -0.004 | -0.225 | 0.241 | 0.078 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.857 | 0.845 | 0.151 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.3.mlp.fc2.weight + | 0.016 | -0.441 | 1.206 | 0.183 | torch.Size([180]) || stage8.3.residual_group.blocks.3.mlp.fc2.bias + | -0.002 | -0.437 | 0.634 | 0.077 | torch.Size([180, 180]) || stage8.3.linear.weight + | -0.003 | -0.564 | 0.338 | 0.145 | torch.Size([180]) || stage8.3.linear.bias + | 1.164 | 0.238 | 1.496 | 0.205 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.weight + | -0.033 | -0.667 | 0.780 | 0.170 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.bias + | -0.002 | -3.025 | 1.339 | 0.130 | torch.Size([3375, 6]) || stage8.4.residual_group.blocks.0.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.4.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.736 | 0.735 | 0.147 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.0.attn.qkv_self.weight + | -0.007 | -0.468 | 0.575 | 0.112 | torch.Size([540]) || stage8.4.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.725 | 0.750 | 0.162 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.0.attn.proj.weight + | -0.004 | -0.461 | 0.540 | 0.163 | torch.Size([180]) || stage8.4.residual_group.blocks.0.attn.proj.bias + | 0.804 | 0.361 | 0.962 | 0.091 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.weight + | -0.025 | -0.421 | 0.837 | 0.127 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.bias + | -0.002 | -0.664 | 0.869 | 0.129 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc11.weight + | -0.028 | -0.519 | 0.180 | 0.098 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.793 | 0.821 | 0.156 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc12.weight + | 0.001 | -0.235 | 0.329 | 0.081 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.758 | 0.730 | 0.153 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.0.mlp.fc2.weight + | 0.010 | -0.332 | 0.306 | 0.118 | torch.Size([180]) || stage8.4.residual_group.blocks.0.mlp.fc2.bias + | 1.097 | 0.202 | 1.361 | 0.200 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.weight + | -0.034 | -0.597 | 0.687 | 0.147 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.bias + | 0.007 | -4.645 | 1.140 | 0.130 | torch.Size([3375, 6]) || stage8.4.residual_group.blocks.1.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.4.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -1.002 | 0.810 | 0.144 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.1.attn.qkv_self.weight + | 0.005 | -0.407 | 0.438 | 0.108 | torch.Size([540]) || stage8.4.residual_group.blocks.1.attn.qkv_self.bias + | -0.001 | -0.646 | 0.678 | 0.154 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.1.attn.proj.weight + | 0.004 | -0.418 | 0.415 | 0.139 | torch.Size([180]) || stage8.4.residual_group.blocks.1.attn.proj.bias + | 0.836 | 0.316 | 1.026 | 0.106 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.weight + | -0.024 | -0.364 | 0.851 | 0.117 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.bias + | -0.002 | -0.690 | 0.848 | 0.128 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc11.weight + | 
-0.032 | -0.484 | 0.195 | 0.101 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.863 | 0.768 | 0.155 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc12.weight + | -0.001 | -0.319 | 0.409 | 0.078 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.836 | 0.822 | 0.154 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.1.mlp.fc2.weight + | 0.019 | -0.356 | 0.374 | 0.129 | torch.Size([180]) || stage8.4.residual_group.blocks.1.mlp.fc2.bias + | 1.151 | 0.229 | 1.393 | 0.176 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.weight + | -0.028 | -0.649 | 0.925 | 0.149 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.bias + | -0.005 | -3.864 | 1.138 | 0.140 | torch.Size([3375, 6]) || stage8.4.residual_group.blocks.2.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.4.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -1.813 | 0.897 | 0.146 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.2.attn.qkv_self.weight + | -0.001 | -0.449 | 0.486 | 0.103 | torch.Size([540]) || stage8.4.residual_group.blocks.2.attn.qkv_self.bias + | -0.001 | -0.739 | 0.710 | 0.175 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.2.attn.proj.weight + | -0.000 | -0.542 | 0.407 | 0.162 | torch.Size([180]) || stage8.4.residual_group.blocks.2.attn.proj.bias + | 0.820 | 0.329 | 0.989 | 0.094 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.weight + | -0.025 | -0.461 | 0.753 | 0.106 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.bias + | -0.001 | -0.648 | 0.788 | 0.125 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc11.weight + | -0.015 | -0.501 | 0.248 | 0.101 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.745 | 0.796 | 0.155 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc12.weight + | 0.007 | -0.244 | 0.231 | 0.080 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.771 | 1.049 | 0.154 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.2.mlp.fc2.weight + | 0.018 | -0.360 | 0.336 | 0.143 | torch.Size([180]) || stage8.4.residual_group.blocks.2.mlp.fc2.bias + | 1.177 | 0.269 | 1.385 | 0.163 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.weight + | -0.028 | -0.700 | 0.877 | 0.145 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.bias + | -0.005 | -2.684 | 0.830 | 0.097 | torch.Size([3375, 6]) || stage8.4.residual_group.blocks.3.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.4.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.996 | 0.727 | 0.142 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.3.attn.qkv_self.weight + | 0.004 | -0.326 | 0.449 | 0.101 | torch.Size([540]) || stage8.4.residual_group.blocks.3.attn.qkv_self.bias + | -0.001 | -0.777 | 0.785 | 0.170 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.3.attn.proj.weight + | 0.004 | -0.396 | 0.449 | 0.158 | torch.Size([180]) || stage8.4.residual_group.blocks.3.attn.proj.bias + | 0.790 | 0.392 | 1.005 | 0.078 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.weight + | -0.030 | -0.481 | 0.719 | 0.110 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.bias + | -0.001 | -0.569 | 0.732 | 0.121 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc11.weight + | 
0.020 | -0.670 | 0.335 | 0.125 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.822 | 0.831 | 0.155 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc12.weight + | -0.003 | -0.282 | 0.296 | 0.089 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.856 | 0.886 | 0.155 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.3.mlp.fc2.weight + | 0.029 | -0.390 | 0.437 | 0.161 | torch.Size([180]) || stage8.4.residual_group.blocks.3.mlp.fc2.bias + | -0.002 | -0.490 | 0.625 | 0.079 | torch.Size([180, 180]) || stage8.4.linear.weight + | -0.002 | -0.573 | 0.398 | 0.168 | torch.Size([180]) || stage8.4.linear.bias + | 1.337 | 0.163 | 1.694 | 0.268 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.weight + | -0.025 | -0.727 | 1.008 | 0.186 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.bias + | -0.738 | -2.885 | 5.812 | 0.748 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.852 | 0.854 | 0.135 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.0.attn.qkv_self.weight + | -0.005 | -0.546 | 0.550 | 0.112 | torch.Size([540]) || stage8.5.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.901 | 0.781 | 0.195 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.0.attn.proj.weight + | -0.020 | -0.545 | 0.469 | 0.173 | torch.Size([180]) || stage8.5.residual_group.blocks.0.attn.proj.bias + | 0.956 | 0.367 | 1.185 | 0.129 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.weight + | -0.033 | -0.519 | 0.833 | 0.147 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.bias + | -0.001 | -0.832 | 0.580 | 0.119 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc11.weight + | 0.055 | -0.256 | 0.378 | 0.097 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -1.058 | 0.859 | 0.154 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc12.weight + | 0.006 | -0.377 | 0.318 | 0.093 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc12.bias + | -0.001 | -0.751 | 0.766 | 0.156 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.0.mlp.fc2.weight + | -0.011 | -0.316 | 0.323 | 0.132 | torch.Size([180]) || stage8.5.residual_group.blocks.0.mlp.fc2.bias + | 1.346 | 0.151 | 1.746 | 0.272 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.weight + | -0.023 | -0.691 | 0.993 | 0.169 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.bias + | -0.705 | -2.997 | 4.745 | 0.748 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.911 | 0.984 | 0.141 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.1.attn.qkv_self.weight + | -0.011 | -0.405 | 0.288 | 0.095 | torch.Size([540]) || stage8.5.residual_group.blocks.1.attn.qkv_self.bias + | 0.001 | -0.853 | 0.977 | 0.210 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.1.attn.proj.weight + | -0.008 | -0.516 | 0.596 | 0.170 | torch.Size([180]) || stage8.5.residual_group.blocks.1.attn.proj.bias + | 1.021 | 0.333 | 1.268 | 0.154 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.weight + | -0.034 | -0.512 | 0.812 | 0.134 | torch.Size([180]) || 
stage8.5.residual_group.blocks.1.norm2.bias + | 0.000 | -0.561 | 0.546 | 0.120 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc11.weight + | 0.050 | -0.450 | 0.320 | 0.100 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc11.bias + | 0.001 | -0.907 | 0.752 | 0.157 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc12.weight + | -0.008 | -0.306 | 0.343 | 0.091 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc12.bias + | -0.001 | -0.891 | 0.741 | 0.158 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.1.mlp.fc2.weight + | -0.014 | -0.407 | 0.478 | 0.168 | torch.Size([180]) || stage8.5.residual_group.blocks.1.mlp.fc2.bias + | 1.266 | 0.195 | 1.640 | 0.251 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.weight + | -0.028 | -0.680 | 0.987 | 0.162 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.bias + | -0.515 | -2.839 | 4.668 | 0.636 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.2.attn.relative_position_index + | 0.001 | -0.968 | 0.890 | 0.144 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.2.attn.qkv_self.weight + | -0.001 | -0.372 | 0.390 | 0.095 | torch.Size([540]) || stage8.5.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -1.001 | 0.995 | 0.221 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.2.attn.proj.weight + | -0.012 | -0.576 | 0.456 | 0.172 | torch.Size([180]) || stage8.5.residual_group.blocks.2.attn.proj.bias + | 1.046 | 0.311 | 1.264 | 0.147 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.weight + | -0.033 | -0.519 | 0.785 | 0.123 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.bias + | 0.000 | -0.533 | 0.563 | 0.119 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc11.weight + | 0.053 | -0.314 | 0.364 | 0.109 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.862 | 0.822 | 0.158 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc12.weight + | -0.004 | -0.266 | 0.289 | 0.084 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc12.bias + | 0.001 | -0.787 | 0.886 | 0.161 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.2.mlp.fc2.weight + | -0.007 | -0.421 | 0.503 | 0.171 | torch.Size([180]) || stage8.5.residual_group.blocks.2.mlp.fc2.bias + | 1.226 | 0.277 | 1.561 | 0.208 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.weight + | -0.032 | -0.670 | 1.030 | 0.168 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.bias + | -0.401 | -1.953 | 3.930 | 0.598 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.857 | 0.754 | 0.139 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.3.attn.qkv_self.weight + | 0.004 | -0.317 | 0.278 | 0.081 | torch.Size([540]) || stage8.5.residual_group.blocks.3.attn.qkv_self.bias + | -0.002 | -1.022 | 0.999 | 0.200 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.3.attn.proj.weight + | -0.009 | -0.384 | 0.393 | 0.165 | torch.Size([180]) || stage8.5.residual_group.blocks.3.attn.proj.bias + | 1.038 | 0.340 | 1.216 | 0.128 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.weight + | -0.034 | -0.574 | 0.775 | 0.124 | torch.Size([180]) || 
stage8.5.residual_group.blocks.3.norm2.bias + | 0.001 | -0.588 | 0.613 | 0.119 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc11.weight + | 0.063 | -0.447 | 0.307 | 0.111 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.873 | 0.775 | 0.159 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc12.weight + | 0.001 | -0.456 | 0.435 | 0.092 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.819 | 0.772 | 0.160 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.3.mlp.fc2.weight + | -0.018 | -0.319 | 0.340 | 0.131 | torch.Size([180]) || stage8.5.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.562 | 0.471 | 0.080 | torch.Size([180, 180]) || stage8.5.linear.weight + | 0.024 | -0.609 | 0.488 | 0.184 | torch.Size([180]) || stage8.5.linear.bias + | 1.369 | 0.171 | 1.961 | 0.355 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.weight + | -0.028 | -0.642 | 0.733 | 0.196 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.bias + | -0.029 | -1.759 | 1.624 | 0.312 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.686 | 0.691 | 0.113 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.0.attn.qkv_self.weight + | -0.003 | -0.261 | 0.301 | 0.081 | torch.Size([540]) || stage8.6.residual_group.blocks.0.attn.qkv_self.bias + | 0.001 | -0.736 | 0.637 | 0.149 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.0.attn.proj.weight + | -0.006 | -0.293 | 0.300 | 0.106 | torch.Size([180]) || stage8.6.residual_group.blocks.0.attn.proj.bias + | 1.302 | 0.401 | 1.613 | 0.192 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.weight + | -0.029 | -0.475 | 0.696 | 0.159 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.bias + | -0.001 | -0.649 | 0.564 | 0.119 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc11.weight + | 0.036 | -0.275 | 0.218 | 0.071 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.717 | 0.831 | 0.148 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc12.weight + | 0.006 | -0.231 | 0.270 | 0.074 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.833 | 0.791 | 0.150 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.0.mlp.fc2.weight + | 0.004 | -0.364 | 0.324 | 0.134 | torch.Size([180]) || stage8.6.residual_group.blocks.0.mlp.fc2.bias + | 1.450 | 0.218 | 1.962 | 0.354 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.weight + | -0.025 | -0.716 | 0.851 | 0.206 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.bias + | -0.045 | -1.549 | 2.100 | 0.321 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.759 | 0.636 | 0.110 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.1.attn.qkv_self.weight + | -0.001 | -0.235 | 0.269 | 0.070 | torch.Size([540]) || stage8.6.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.691 | 0.657 | 0.145 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.1.attn.proj.weight + | -0.007 | -0.375 | 0.328 | 0.116 | torch.Size([180]) || stage8.6.residual_group.blocks.1.attn.proj.bias + | 1.326 | 
0.335 | 1.596 | 0.186 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.weight + | -0.029 | -0.566 | 0.748 | 0.160 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.bias + | -0.002 | -0.667 | 0.591 | 0.121 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc11.weight + | 0.042 | -0.387 | 0.373 | 0.078 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.685 | 0.894 | 0.147 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc12.weight + | 0.000 | -0.353 | 0.326 | 0.092 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.801 | 0.692 | 0.149 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.1.mlp.fc2.weight + | -0.007 | -0.331 | 0.273 | 0.127 | torch.Size([180]) || stage8.6.residual_group.blocks.1.mlp.fc2.bias + | 1.416 | 0.215 | 1.819 | 0.303 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.weight + | -0.024 | -0.596 | 0.869 | 0.211 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.bias + | -0.038 | -2.355 | 1.330 | 0.286 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.964 | 0.732 | 0.112 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.2.attn.qkv_self.weight + | 0.002 | -0.192 | 0.251 | 0.052 | torch.Size([540]) || stage8.6.residual_group.blocks.2.attn.qkv_self.bias + | 0.001 | -0.736 | 0.624 | 0.138 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.2.attn.proj.weight + | -0.008 | -0.376 | 0.254 | 0.119 | torch.Size([180]) || stage8.6.residual_group.blocks.2.attn.proj.bias + | 1.352 | 0.217 | 1.546 | 0.187 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.weight + | -0.023 | -0.627 | 0.881 | 0.164 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.bias + | -0.001 | -0.616 | 0.688 | 0.122 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc11.weight + | 0.040 | -0.332 | 0.242 | 0.083 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.970 | 0.669 | 0.148 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc12.weight + | 0.006 | -0.333 | 0.371 | 0.092 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.849 | 0.824 | 0.150 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.2.mlp.fc2.weight + | -0.007 | -0.282 | 0.333 | 0.111 | torch.Size([180]) || stage8.6.residual_group.blocks.2.mlp.fc2.bias + | 1.346 | 0.206 | 1.798 | 0.286 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.weight + | -0.022 | -0.742 | 0.797 | 0.196 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.bias + | -0.056 | -1.296 | 2.098 | 0.311 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.693 | 0.597 | 0.103 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.3.attn.qkv_self.weight + | -0.003 | -0.211 | 0.161 | 0.055 | torch.Size([540]) || stage8.6.residual_group.blocks.3.attn.qkv_self.bias + | -0.000 | -0.767 | 0.663 | 0.127 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.3.attn.proj.weight + | -0.011 | -0.269 | 0.169 | 0.072 | torch.Size([180]) || stage8.6.residual_group.blocks.3.attn.proj.bias + | 1.329 | 0.247 | 1.544 | 
0.183 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.weight + | -0.023 | -0.619 | 0.881 | 0.171 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.bias + | -0.001 | -0.670 | 0.594 | 0.124 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc11.weight + | 0.052 | -0.262 | 0.275 | 0.073 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.899 | 0.808 | 0.149 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc12.weight + | -0.009 | -0.273 | 0.326 | 0.090 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc12.bias + | 0.001 | -0.773 | 0.930 | 0.150 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.3.mlp.fc2.weight + | -0.001 | -0.264 | 0.261 | 0.088 | torch.Size([180]) || stage8.6.residual_group.blocks.3.mlp.fc2.bias + | -0.001 | -1.128 | 1.483 | 0.100 | torch.Size([180, 180]) || stage8.6.linear.weight + | 0.014 | -0.757 | 0.769 | 0.160 | torch.Size([180]) || stage8.6.linear.bias + | 0.387 | 0.109 | 1.033 | 0.194 | torch.Size([180]) || norm.weight + | -0.006 | -0.754 | 0.773 | 0.142 | torch.Size([180]) || norm.bias + | 0.001 | -0.596 | 0.563 | 0.121 | torch.Size([120, 180]) || conv_after_body.weight + | -0.016 | -0.251 | 0.121 | 0.061 | torch.Size([120]) || conv_after_body.bias + | 0.003 | -1.347 | 1.476 | 0.161 | torch.Size([64, 120, 1, 3, 3]) || conv_before_upsample.0.weight + | -0.090 | -0.847 | 0.182 | 0.193 | torch.Size([64]) || conv_before_upsample.0.bias + | 0.002 | -1.602 | 0.994 | 0.114 | torch.Size([256, 64, 1, 3, 3]) || upsample.0.weight + | -0.059 | -0.461 | 0.137 | 0.098 | torch.Size([256]) || upsample.0.bias + | -0.005 | -4.099 | 0.822 | 0.076 | torch.Size([256, 64, 1, 3, 3]) || upsample.5.weight + | -0.137 | -0.426 | 0.152 | 0.097 | torch.Size([256]) || upsample.5.bias + | -0.000 | -0.377 | 0.324 | 0.014 | torch.Size([64, 64, 1, 3, 3]) || upsample.10.weight + | -0.000 | -0.016 | 0.014 | 0.003 | torch.Size([64]) || upsample.10.bias + | -0.000 | -0.043 | 0.040 | 0.004 | torch.Size([3, 64, 1, 3, 3]) || conv_last.weight + | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([3]) || conv_last.bias + +22-03-11 10:10:42.661 : task: 003_train_vrt_videosr_bi_vimeo_7frames + model: vrt + gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7] + dist: False + find_unused_parameters: False + use_static_graph: True + scale: 4 + n_channels: 3 + path:[ + root: experiments + pretrained_netG: model_zoo/vrt/002_VRT_videosr_bi_REDS_16frames.pth + pretrained_netE: None + task: experiments/003_train_vrt_videosr_bi_vimeo_7frames + log: experiments/003_train_vrt_videosr_bi_vimeo_7frames + options: experiments/003_train_vrt_videosr_bi_vimeo_7frames/options + models: experiments/003_train_vrt_videosr_bi_vimeo_7frames/models + images: experiments/003_train_vrt_videosr_bi_vimeo_7frames/images + pretrained_optimizerG: None + ] + datasets:[ + train:[ + name: train_dataset + dataset_type: VideoRecurrentTrainVimeoDataset + dataroot_gt: trainsets/vimeo90k + dataroot_lq: trainsets/vimeo90k + meta_info_file: data/meta_info/meta_info_Vimeo90K_train_GT.txt + io_backend:[ + type: disk + ] + num_frame: -1 + gt_size: 256 + interval_list: [1] + random_reverse: True + use_hflip: True + use_rot: True + pad_sequence: True + dataloader_shuffle: True + dataloader_num_workers: 32 + dataloader_batch_size: 8 + phase: train + scale: 4 + n_channels: 3 + ] + test:[ + name: test_dataset + dataset_type: VideoRecurrentTestDataset + dataroot_gt: testsets/Vid4/GT + dataroot_lq: testsets/Vid4/BIx4 + cache_data: True + io_backend:[ + type: disk + ] + 
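Each row in the statistics block above is mean | min | max | std | shape || tensor name for one tensor of the loaded VRT checkpoint. Integer buffers such as relative_position_index are included too, which is why their "mean" is simply the midpoint of the index range (1687 = 3374/2, 112 = 224/2). A minimal sketch of how such a dump can be generated; this is an illustration, not necessarily KAIR's exact logging helper:

```python
import torch.nn as nn

def describe_weights(net: nn.Module) -> str:
    """One ' | mean | min | max | std | shape || name' row per tensor,
    mirroring the statistics block in the log above. Iterating state_dict()
    (an assumption) picks up both parameters and buffers."""
    rows = []
    for name, t in net.state_dict().items():
        v = t.detach().float()  # cast so integer buffers also get mean/std
        rows.append(' | {:.3f} | {:.3f} | {:.3f} | {:.3f} | {} || {}'.format(
            v.mean().item(), v.min().item(), v.max().item(),
            v.std().item(), v.shape, name))
    return '\n'.join(rows)

# Usage: print(describe_weights(model)) right after loading the pretrained weights.
```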
num_frame: -1 + phase: test + scale: 4 + n_channels: 3 + ] + ] + netG:[ + net_type: vrt + upscale: 4 + img_size: [8, 64, 64] + window_size: [8, 8, 8] + depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4] + indep_reconsts: [11, 12] + embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180] + num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6] + spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth + pa_frames: 4 + deformable_groups: 16 + nonblind_denoising: False + use_checkpoint_attn: False + use_checkpoint_ffn: False + no_checkpoint_attn_blocks: [] + no_checkpoint_ffn_blocks: [] + init_type: default + scale: 4 + ] + train:[ + G_lossfn_type: charbonnier + G_lossfn_weight: 1.0 + G_charbonnier_eps: 1e-09 + E_decay: 0 + G_optimizer_type: adam + G_optimizer_lr: 0.0004 + G_optimizer_betas: [0.9, 0.99] + G_optimizer_wd: 0 + G_optimizer_clipgrad: None + G_optimizer_reuse: True + fix_iter: 20000 + fix_lr_mul: 0.125 + fix_keys: ['spynet', 'deform'] + total_iter: 300000 + G_scheduler_type: CosineAnnealingWarmRestarts + G_scheduler_periods: 300000 + G_scheduler_eta_min: 1e-07 + G_regularizer_orthstep: None + G_regularizer_clipstep: None + G_param_strict: False + E_param_strict: True + checkpoint_test: 5000 + checkpoint_save: 5000 + checkpoint_print: 200 + F_feature_layer: 34 + F_weights: 1.0 + F_lossfn_type: l1 + F_use_input_norm: True + F_use_range_norm: False + G_scheduler_restart_weights: 1 + ] + val:[ + save_img: False + pad_seq: False + flip_seq: False + center_frame_only: False + num_frame_testing: 32 + num_frame_overlapping: 2 + size_patch_testing: 128 + ] + opt_path: options/vrt/003_train_vrt_videosr_bi_vimeo_7frames.json + is_train: True + merge_bn: False + merge_bn_startpoint: -1 + num_gpu: 8 + rank: 0 + world_size: 1 + +22-03-11 10:10:42.695 : Number of train images: 64,612, iters: 8,077 +22-03-11 10:10:46.280 : +Networks name: VRT +Params number: 32577991 +Net structure: +VRT( + (conv_first): Conv3d(27, 120, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (spynet): SpyNet( + (basic_module): ModuleList( + (0): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (1): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (2): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (3): BasicModule( + (basic_module): 
Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (4): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (5): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + ) + ) + (stage1): Stage( + (reshape): Sequential( + (0): Rearrange('n c d h w -> n d h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + 
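The six SpyNet BasicModules printed here are identical five-layer conv stacks. At each pyramid level the module maps 8 input channels, presumably the reference frame (3), the warped supporting frame (3), and the upsampled flow estimate (2), to a 2-channel flow residual. A sketch of one level under those assumptions:

```python
import torch.nn as nn

class BasicModule(nn.Module):
    """One SpyNet pyramid level: 8 input channels -> 2-channel flow residual,
    mirroring the Conv2d/ReLU stack in the printout."""
    def __init__(self):
        super().__init__()
        self.basic_module = nn.Sequential(
            nn.Conv2d(8, 32, kernel_size=7, stride=1, padding=3), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=7, stride=1, padding=3), nn.ReLU(),
            nn.Conv2d(64, 32, kernel_size=7, stride=1, padding=3), nn.ReLU(),
            nn.Conv2d(32, 16, kernel_size=7, stride=1, padding=3), nn.ReLU(),
            nn.Conv2d(16, 2, kernel_size=7, stride=1, padding=3))

    def forward(self, x):
        return self.basic_module(x)

# The printout shows six such levels: flow is estimated coarse-to-fine, each
# level adding its residual to the 2x-upsampled estimate from the level below.
spynet_levels = nn.ModuleList(BasicModule() for _ in range(6))
```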
(fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): 
GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(364, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 432, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage2): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): 
Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(364, 120, kernel_size=(3, 3), stride=(1, 
1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 432, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage3): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + 
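Every TMSA block ends with the same Mlp_GEGLU layout: two parallel expansions fc11/fc12 (120 to 240 here, 180 to 360 in stage8), a GELU on one branch, an elementwise product, and a projection fc2 back to the embedding dim. A sketch of the usual GEGLU forward pass; the exact placement of dropout in VRT is an assumption:

```python
import torch
import torch.nn as nn

class MlpGEGLU(nn.Module):
    """GEGLU feed-forward matching the fc11/fc12/act/fc2 layout in the printout."""
    def __init__(self, dim=120, hidden=240, drop=0.0):
        super().__init__()
        self.fc11 = nn.Linear(dim, hidden)  # gate branch (passed through GELU)
        self.fc12 = nn.Linear(dim, hidden)  # value branch (kept linear)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden, dim)
        self.drop = nn.Dropout(drop)        # dropout position is assumed

    def forward(self, x):
        x = self.act(self.fc11(x)) * self.fc12(x)  # gated expansion
        return self.fc2(self.drop(x))

x = torch.randn(2, 64, 120)
assert MlpGEGLU()(x).shape == x.shape
```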
(mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(364, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, 
inplace=True) + (6): Conv2d(120, 432, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage4): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), 
eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(364, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 432, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, 
out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage5): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + 
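Note the symmetry between the encoder and decoder reshapes: stages 2-4 open with a 2x2 space-to-depth Rearrange (120*4 = 480 channels, projected back to 120), while stages 5-7, as seen here, invert it with a depth-to-space Rearrange (120/4 = 30 channels, projected back up to 120). A self-contained shape check of the two patterns exactly as printed:

```python
import torch
import torch.nn as nn
from einops.layers.torch import Rearrange

# Encoder side (stages 2-4): 2x2 space-to-depth, then project 480 -> 120.
down = nn.Sequential(
    Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2),
    nn.LayerNorm(480),
    nn.Linear(480, 120),
    Rearrange('n d h w c -> n c d h w'))

# Decoder side (stages 5-7): 2x2 depth-to-space, then project 30 -> 120.
up = nn.Sequential(
    Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2),
    nn.LayerNorm(30),
    nn.Linear(30, 120),
    Rearrange('n d h w c -> n c d h w'))

x = torch.randn(1, 120, 6, 64, 64)           # (n, c, d, h, w)
assert down(x).shape == (1, 120, 6, 32, 32)  # spatial /2, channels stay 120
assert up(x).shape == (1, 120, 6, 128, 128)  # spatial x2, channels stay 120
```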
(drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(364, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 432, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage6): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, 
out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, 
bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(364, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 432, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage7): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): 
Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): 
Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(364, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 432, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage8): ModuleList( + (0): Sequential( + (0): Rearrange('n c d h w -> n d h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=120, out_features=180, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (1): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + 
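Note on the pa_deform offset head above: its Conv2d widths (364 in, 432 out) follow from flow-guided deformable-alignment bookkeeping. A minimal sketch of that arithmetic; the input split into current feature, warped neighbour features, and flow fields is inferred, since only the totals appear in the log:

    # Channel bookkeeping for the DCNv2PackFlowGuided offset head (inferred split).
    embed_dim = 120            # per-stage feature width (from the dump)
    warped_feats = 2           # warped neighbour features concatenated (assumption)
    flows = 2                  # flow fields of 2 channels each (assumption)
    in_ch = embed_dim + warped_feats * embed_dim + flows * 2
    assert in_ch == 364        # Conv2d(364, 120, ...) above

    deformable_groups = 16     # from this run's netG options
    k = 3                      # 3x3 offset kernel
    out_ch = 3 * deformable_groups * k * k   # x-offset, y-offset, mask per tap
    assert out_ch == 432       # Conv2d(120, 432, ...) above

The statistics further down show every stage*.pa_deform.conv_offset.6.weight and .bias identically zero, the usual zero-initialisation that lets deformable alignment start out as an identity warp.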
(fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (2): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, 
out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (3): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): 
Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (4): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (5): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) 
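Every TMSA block in this dump ends in an Mlp_GEGLU with paired fc11/fc12 projections. A minimal sketch of the gating this implies, assuming the standard GEGLU formulation (only the submodule names and shapes are taken from the dump, e.g. 180 -> 360 gated -> 180 in these stage8 blocks):

    import torch
    import torch.nn as nn

    class MlpGEGLU(nn.Module):
        # Gated-GELU feed-forward matching the printed fc11/fc12/fc2 shapes.
        def __init__(self, dim=180, hidden=360, drop=0.0):
            super().__init__()
            self.fc11 = nn.Linear(dim, hidden)   # value branch
            self.fc12 = nn.Linear(dim, hidden)   # gate branch
            self.act = nn.GELU()
            self.fc2 = nn.Linear(hidden, dim)
            self.drop = nn.Dropout(drop)

        def forward(self, x):
            # GEGLU: GELU(value) * gate, then project back to dim.
            return self.fc2(self.drop(self.act(self.fc11(x)) * self.fc12(x)))

    y = MlpGEGLU()(torch.randn(2, 64, 180))   # -> shape (2, 64, 180)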
+ (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (6): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): 
Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + ) + (norm): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (conv_after_body): Linear(in_features=180, out_features=120, bias=True) + (conv_before_upsample): Sequential( + (0): Conv3d(120, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): LeakyReLU(negative_slope=0.01, inplace=True) + ) + (upsample): Upsample( + (0): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): Transpose_Dim12() + (2): PixelShuffle(upscale_factor=2) + (3): Transpose_Dim12() + (4): LeakyReLU(negative_slope=0.1, inplace=True) + (5): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (6): Transpose_Dim12() + (7): PixelShuffle(upscale_factor=2) + (8): Transpose_Dim12() + (9): LeakyReLU(negative_slope=0.1, inplace=True) + (10): Conv3d(64, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + ) + (conv_last): Conv3d(64, 3, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) +) + +22-03-11 10:10:46.456 : + | mean | min | max | std || shape + | 0.000 | -1.496 | 1.623 | 0.115 | torch.Size([120, 27, 1, 3, 3]) || conv_first.weight + | -0.005 | -1.075 | 0.916 | 0.274 | torch.Size([120]) || conv_first.bias + | 0.449 | 0.406 | 0.485 | 0.040 | torch.Size([1, 3, 1, 1]) || spynet.mean + | 0.226 | 0.224 | 0.229 | 0.003 | torch.Size([1, 3, 1, 1]) || spynet.std + | -0.000 | -0.656 | 0.699 | 0.067 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.0.basic_module.0.weight + | -0.037 | -0.877 | 0.359 | 0.346 | torch.Size([32]) || spynet.basic_module.0.basic_module.0.bias + | -0.007 | -3.201 | 0.948 | 0.097 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.0.basic_module.2.weight + | 0.063 | -1.264 | 0.752 | 0.323 | torch.Size([64]) || spynet.basic_module.0.basic_module.2.bias + | -0.010 | -4.633 | 0.568 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.0.basic_module.4.weight + | 0.158 | -0.704 | 0.861 | 0.357 | torch.Size([32]) || spynet.basic_module.0.basic_module.4.bias + | -0.024 | -1.714 | 0.414 | 0.091 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.0.basic_module.6.weight + | 0.779 | -1.061 | 1.164 | 0.519 | torch.Size([16]) || spynet.basic_module.0.basic_module.6.bias + | 0.000 | -0.148 | 0.161 | 0.018 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.0.basic_module.8.weight + | 0.002 | -0.000 | 0.004 | 0.003 | 
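The per-parameter table that begins at 10:10:46 above (mean | min | max | std | shape || name) is presumably produced by iterating the model's state_dict, which also covers registered buffers such as relative_position_index. A minimal reproduction sketch; the helper name is hypothetical:

    import torch

    def describe_params(model: torch.nn.Module) -> str:
        # One row per tensor, mirroring the table format in this log.
        rows = [' | mean | min | max | std | shape || name']
        for name, t in model.state_dict().items():
            v = t.detach().float()
            rows.append(f' | {v.mean():.3f} | {v.min():.3f} | {v.max():.3f}'
                        f' | {v.std():.3f} | {v.shape} || {name}')
        return '\n'.join(rows)

The first rows are easy to sanity-check by eye: spynet.mean and spynet.std are the usual RGB normalisation constants (0.485/0.456/0.406 and 0.229/0.224/0.225) stored as buffers, which is exactly the min/mean/max pattern printed above.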
torch.Size([2]) || spynet.basic_module.0.basic_module.8.bias + | 0.000 | -0.745 | 0.760 | 0.070 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.1.basic_module.0.weight + | -0.019 | -0.848 | 0.359 | 0.331 | torch.Size([32]) || spynet.basic_module.1.basic_module.0.bias + | -0.010 | -3.373 | 0.916 | 0.099 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.1.basic_module.2.weight + | 0.037 | -1.227 | 0.720 | 0.303 | torch.Size([64]) || spynet.basic_module.1.basic_module.2.bias + | -0.009 | -4.425 | 0.539 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.1.basic_module.4.weight + | 0.158 | -0.758 | 0.988 | 0.386 | torch.Size([32]) || spynet.basic_module.1.basic_module.4.bias + | -0.020 | -1.647 | 0.319 | 0.084 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.1.basic_module.6.weight + | 0.777 | -1.211 | 1.152 | 0.550 | torch.Size([16]) || spynet.basic_module.1.basic_module.6.bias + | 0.000 | -0.126 | 0.144 | 0.017 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.1.basic_module.8.weight + | 0.004 | 0.001 | 0.008 | 0.005 | torch.Size([2]) || spynet.basic_module.1.basic_module.8.bias + | 0.000 | -0.938 | 0.872 | 0.088 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.2.basic_module.0.weight + | -0.028 | -1.086 | 0.552 | 0.435 | torch.Size([32]) || spynet.basic_module.2.basic_module.0.bias + | -0.011 | -4.624 | 1.203 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.2.basic_module.2.weight + | 0.022 | -1.298 | 0.715 | 0.312 | torch.Size([64]) || spynet.basic_module.2.basic_module.2.bias + | -0.010 | -1.806 | 0.627 | 0.092 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.2.basic_module.4.weight + | 0.118 | -0.698 | 0.750 | 0.332 | torch.Size([32]) || spynet.basic_module.2.basic_module.4.bias + | -0.014 | -1.277 | 0.337 | 0.067 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.2.basic_module.6.weight + | 0.684 | -1.730 | 0.954 | 0.648 | torch.Size([16]) || spynet.basic_module.2.basic_module.6.bias + | 0.000 | -0.031 | 0.042 | 0.009 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.2.basic_module.8.weight + | -0.010 | -0.010 | -0.010 | 0.000 | torch.Size([2]) || spynet.basic_module.2.basic_module.8.bias + | -0.000 | -0.956 | 0.847 | 0.089 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.3.basic_module.0.weight + | -0.049 | -1.175 | 0.652 | 0.477 | torch.Size([32]) || spynet.basic_module.3.basic_module.0.bias + | -0.010 | -4.892 | 1.180 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.3.basic_module.2.weight + | 0.021 | -1.294 | 0.764 | 0.316 | torch.Size([64]) || spynet.basic_module.3.basic_module.2.bias + | -0.010 | -1.793 | 0.556 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.3.basic_module.4.weight + | 0.123 | -0.717 | 0.737 | 0.335 | torch.Size([32]) || spynet.basic_module.3.basic_module.4.bias + | -0.012 | -1.102 | 0.291 | 0.061 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.3.basic_module.6.weight + | 0.650 | -1.838 | 0.913 | 0.669 | torch.Size([16]) || spynet.basic_module.3.basic_module.6.bias + | 0.000 | -0.032 | 0.039 | 0.006 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.3.basic_module.8.weight + | 0.000 | -0.012 | 0.012 | 0.017 | torch.Size([2]) || spynet.basic_module.3.basic_module.8.bias + | -0.000 | -0.953 | 0.855 | 0.089 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.4.basic_module.0.weight + | -0.009 | -1.001 | 0.584 | 0.427 | torch.Size([32]) || spynet.basic_module.4.basic_module.0.bias + | -0.010 | -5.054 | 1.223 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.4.basic_module.2.weight + | 0.023 | 
-1.315 | 0.884 | 0.326 | torch.Size([64]) || spynet.basic_module.4.basic_module.2.bias + | -0.009 | -1.786 | 0.534 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.4.basic_module.4.weight + | 0.142 | -0.698 | 0.780 | 0.342 | torch.Size([32]) || spynet.basic_module.4.basic_module.4.bias + | -0.011 | -0.957 | 0.276 | 0.057 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.4.basic_module.6.weight + | 0.653 | -1.854 | 0.943 | 0.677 | torch.Size([16]) || spynet.basic_module.4.basic_module.6.bias + | 0.000 | -0.034 | 0.035 | 0.005 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.4.basic_module.8.weight + | -0.001 | -0.010 | 0.008 | 0.012 | torch.Size([2]) || spynet.basic_module.4.basic_module.8.bias + | -0.000 | -0.918 | 0.865 | 0.087 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.5.basic_module.0.weight + | 0.047 | -0.824 | 0.510 | 0.392 | torch.Size([32]) || spynet.basic_module.5.basic_module.0.bias + | -0.009 | -5.094 | 1.213 | 0.118 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.5.basic_module.2.weight + | 0.029 | -1.319 | 0.938 | 0.330 | torch.Size([64]) || spynet.basic_module.5.basic_module.2.bias + | -0.007 | -1.794 | 0.519 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.5.basic_module.4.weight + | 0.145 | -0.725 | 0.830 | 0.349 | torch.Size([32]) || spynet.basic_module.5.basic_module.4.bias + | -0.008 | -0.766 | 0.275 | 0.052 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.5.basic_module.6.weight + | 0.659 | -1.945 | 0.999 | 0.706 | torch.Size([16]) || spynet.basic_module.5.basic_module.6.bias + | 0.000 | -0.025 | 0.026 | 0.002 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.5.basic_module.8.weight + | 0.014 | 0.001 | 0.027 | 0.018 | torch.Size([2]) || spynet.basic_module.5.basic_module.8.bias + | 1.335 | 0.614 | 2.324 | 0.313 | torch.Size([120]) || stage1.reshape.1.weight + | -0.007 | -0.451 | 0.392 | 0.149 | torch.Size([120]) || stage1.reshape.1.bias + | 0.640 | 0.164 | 1.487 | 0.258 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.weight + | -0.072 | -1.225 | 0.558 | 0.260 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.bias + | -0.295 | -4.200 | 2.891 | 0.402 | torch.Size([675, 6]) || stage1.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.0.attn.position_bias + | 0.001 | -0.736 | 0.771 | 0.143 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_self.weight + | -0.002 | -0.412 | 0.503 | 0.106 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_self.bias + | 0.001 | -0.711 | 0.595 | 0.091 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.attn.proj.weight + | -0.006 | -0.195 | 0.530 | 0.097 | torch.Size([120]) || stage1.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -1.076 | 1.181 | 0.133 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.000 | -0.228 | 0.294 | 0.059 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.836 | 0.408 | 1.248 | 0.162 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.weight + | 0.042 | -0.494 | 0.495 | 0.159 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.bias + | 0.003 | -0.889 | 0.982 | 0.142 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc11.weight + | 0.041 | -0.364 | 0.458 | 0.117 | torch.Size([240]) || 
stage1.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.757 | 0.882 | 0.140 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc12.weight + | 0.011 | -0.400 | 0.470 | 0.157 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.852 | 1.093 | 0.139 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.mlp.fc2.weight + | 0.022 | -0.265 | 0.384 | 0.096 | torch.Size([120]) || stage1.residual_group1.blocks.0.mlp.fc2.bias + | 0.894 | 0.195 | 1.588 | 0.211 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.weight + | -0.156 | -1.734 | 0.260 | 0.208 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.bias + | -0.433 | -4.335 | 2.455 | 0.555 | torch.Size([675, 6]) || stage1.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.1.attn.position_bias + | -0.001 | -1.631 | 1.615 | 0.174 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_self.weight + | 0.005 | -0.246 | 0.392 | 0.072 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.697 | 0.574 | 0.098 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.attn.proj.weight + | 0.011 | -0.191 | 0.529 | 0.104 | torch.Size([120]) || stage1.residual_group1.blocks.1.attn.proj.bias + | -0.001 | -1.260 | 1.186 | 0.133 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.002 | -0.207 | 0.162 | 0.050 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.725 | 0.421 | 0.899 | 0.072 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.weight + | 0.043 | -0.750 | 0.403 | 0.161 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.bias + | -0.001 | -0.950 | 0.899 | 0.146 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc11.weight + | -0.001 | -0.381 | 0.301 | 0.092 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.615 | 0.630 | 0.142 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc12.weight + | 0.009 | -0.473 | 0.647 | 0.131 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc12.bias + | 0.001 | -0.789 | 0.813 | 0.146 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.mlp.fc2.weight + | -0.041 | -0.335 | 0.331 | 0.119 | torch.Size([120]) || stage1.residual_group1.blocks.1.mlp.fc2.bias + | 1.087 | 0.163 | 1.663 | 0.218 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.weight + | -0.188 | -1.539 | 0.134 | 0.175 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.bias + | -0.505 | -4.230 | 3.070 | 0.545 | torch.Size([675, 6]) || stage1.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.2.attn.position_bias + | -0.000 | -1.348 | 1.453 | 0.171 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_self.weight + | 0.007 | -0.394 | 0.633 | 0.080 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_self.bias + | 0.001 | -0.561 | 0.466 | 0.108 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.attn.proj.weight + | 0.028 | -0.263 | 0.277 | 0.111 | torch.Size([120]) || 
stage1.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.982 | 1.268 | 0.124 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.001 | -0.139 | 0.149 | 0.035 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.743 | 0.234 | 0.925 | 0.092 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.weight + | 0.030 | -1.015 | 0.440 | 0.156 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.bias + | -0.002 | -0.956 | 1.234 | 0.155 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc11.weight + | 0.003 | -0.419 | 0.302 | 0.108 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.723 | 0.609 | 0.143 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc12.weight + | -0.007 | -0.362 | 0.529 | 0.129 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.768 | 0.645 | 0.147 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.mlp.fc2.weight + | -0.033 | -0.281 | 0.244 | 0.100 | torch.Size([120]) || stage1.residual_group1.blocks.2.mlp.fc2.bias + | 1.076 | 0.178 | 1.503 | 0.199 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.weight + | -0.153 | -1.699 | 0.096 | 0.171 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.bias + | -0.815 | -4.386 | 4.546 | 0.797 | torch.Size([675, 6]) || stage1.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.3.attn.position_bias + | 0.001 | -2.332 | 2.215 | 0.164 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_self.weight + | -0.004 | -0.455 | 0.400 | 0.070 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.504 | 0.556 | 0.108 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.attn.proj.weight + | -0.006 | -0.339 | 0.365 | 0.137 | torch.Size([120]) || stage1.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -1.444 | 1.191 | 0.122 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.001 | -0.162 | 0.140 | 0.029 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.715 | 0.229 | 0.865 | 0.078 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.weight + | 0.026 | -1.011 | 0.287 | 0.151 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.bias + | -0.003 | -0.761 | 0.828 | 0.148 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc11.weight + | 0.014 | -0.337 | 0.418 | 0.135 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.716 | 0.712 | 0.149 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc12.weight + | 0.003 | -0.427 | 0.369 | 0.124 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc12.bias + | 0.001 | -0.719 | 0.640 | 0.151 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.mlp.fc2.weight + | -0.010 | -0.557 | 0.227 | 0.103 | torch.Size([120]) || stage1.residual_group1.blocks.3.mlp.fc2.bias + | 1.161 | 0.188 | 1.556 | 0.179 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.weight + | -0.165 | -1.773 | 0.054 | 0.186 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.bias + | -0.575 | -3.741 | 5.261 | 0.767 | torch.Size([675, 6]) || stage1.residual_group1.blocks.4.attn.relative_position_bias_table + | 
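The relative_position_bias_table / relative_position_index shapes recurring in this table encode the attention window geometry: residual_group1 blocks attend over a (2, 8, 8) window (frame pairs, for mutual attention), residual_group2 blocks over (8, 8, 8). A quick arithmetic check, assuming the standard Swin-style relative-position indexing:

    def rel_pos_shapes(window):
        # Swin-style relative position encoding for a 3D window.
        d, h, w = window
        tokens = d * h * w                            # index is tokens x tokens
        table = (2*d - 1) * (2*h - 1) * (2*w - 1)     # distinct 3D offsets
        return tokens, table

    assert rel_pos_shapes((2, 8, 8)) == (128, 675)    # residual_group1 blocks
    assert rel_pos_shapes((8, 8, 8)) == (512, 3375)   # residual_group2 blocks
    # The trailing 6 in torch.Size([675, 6]) is one bias value per attention head.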
337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.4.attn.position_bias + | 0.000 | -2.020 | 2.251 | 0.173 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_self.weight + | 0.000 | -0.318 | 0.312 | 0.071 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.463 | 0.456 | 0.112 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.attn.proj.weight + | 0.002 | -0.406 | 0.393 | 0.154 | torch.Size([120]) || stage1.residual_group1.blocks.4.attn.proj.bias + | -0.001 | -0.968 | 1.330 | 0.123 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.001 | -0.152 | 0.176 | 0.030 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.699 | 0.230 | 0.850 | 0.073 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.weight + | 0.029 | -1.033 | 0.300 | 0.149 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.bias + | -0.002 | -0.718 | 0.803 | 0.145 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc11.weight + | 0.002 | -0.389 | 0.405 | 0.139 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc11.bias + | -0.001 | -0.582 | 0.624 | 0.151 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc12.weight + | 0.003 | -0.385 | 0.386 | 0.118 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.677 | 0.737 | 0.153 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.mlp.fc2.weight + | 0.003 | -0.671 | 0.208 | 0.108 | torch.Size([120]) || stage1.residual_group1.blocks.4.mlp.fc2.bias + | 1.067 | 0.173 | 1.473 | 0.179 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.weight + | -0.129 | -1.487 | 0.138 | 0.166 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.bias + | -0.530 | -3.629 | 3.705 | 0.621 | torch.Size([675, 6]) || stage1.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.5.attn.position_bias + | 0.000 | -2.344 | 1.768 | 0.157 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_self.weight + | -0.001 | -0.428 | 0.265 | 0.082 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_self.bias + | -0.001 | -0.541 | 0.559 | 0.120 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.attn.proj.weight + | 0.031 | -0.324 | 0.379 | 0.133 | torch.Size([120]) || stage1.residual_group1.blocks.5.attn.proj.bias + | -0.001 | -1.380 | 0.992 | 0.120 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.000 | -0.100 | 0.111 | 0.027 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.637 | 0.273 | 0.780 | 0.064 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.weight + | 0.022 | -1.160 | 0.338 | 0.149 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.bias + | -0.002 | -0.696 | 0.638 | 0.139 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc11.weight + | 0.007 | -0.366 | 0.364 | 0.134 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc11.bias + | -0.001 | -0.581 | 0.657 | 0.151 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc12.weight + | -0.004 | -0.366 | 
0.244 | 0.105 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -1.143 | 0.787 | 0.154 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.mlp.fc2.weight + | 0.023 | -1.254 | 0.407 | 0.160 | torch.Size([120]) || stage1.residual_group1.blocks.5.mlp.fc2.bias + | 0.001 | -0.293 | 0.270 | 0.065 | torch.Size([120, 120]) || stage1.linear1.weight + | 0.006 | -0.209 | 0.382 | 0.093 | torch.Size([120]) || stage1.linear1.bias + | 0.811 | 0.432 | 1.092 | 0.108 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.weight + | 0.033 | -0.763 | 0.477 | 0.200 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.bias + | -0.049 | -2.996 | 1.734 | 0.246 | torch.Size([3375, 6]) || stage1.residual_group2.blocks.0.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage1.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.847 | 1.215 | 0.150 | torch.Size([360, 120]) || stage1.residual_group2.blocks.0.attn.qkv_self.weight + | -0.000 | -0.542 | 0.581 | 0.147 | torch.Size([360]) || stage1.residual_group2.blocks.0.attn.qkv_self.bias + | 0.001 | -0.536 | 0.569 | 0.124 | torch.Size([120, 120]) || stage1.residual_group2.blocks.0.attn.proj.weight + | -0.004 | -0.195 | 0.602 | 0.102 | torch.Size([120]) || stage1.residual_group2.blocks.0.attn.proj.bias + | 0.568 | 0.438 | 0.872 | 0.074 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.weight + | 0.025 | -0.782 | 0.342 | 0.164 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.bias + | 0.003 | -0.601 | 0.699 | 0.126 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc11.weight + | 0.068 | -0.329 | 0.446 | 0.095 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc11.bias + | 0.001 | -0.807 | 0.710 | 0.143 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc12.weight + | -0.002 | -0.585 | 0.392 | 0.117 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.779 | 0.575 | 0.142 | torch.Size([120, 240]) || stage1.residual_group2.blocks.0.mlp.fc2.weight + | 0.008 | -0.377 | 0.374 | 0.159 | torch.Size([120]) || stage1.residual_group2.blocks.0.mlp.fc2.bias + | 0.942 | 0.411 | 1.171 | 0.093 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.weight + | 0.038 | -0.837 | 0.321 | 0.152 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.bias + | -0.077 | -2.150 | 2.175 | 0.237 | torch.Size([3375, 6]) || stage1.residual_group2.blocks.1.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage1.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.750 | 0.771 | 0.159 | torch.Size([360, 120]) || stage1.residual_group2.blocks.1.attn.qkv_self.weight + | -0.004 | -0.589 | 0.559 | 0.145 | torch.Size([360]) || stage1.residual_group2.blocks.1.attn.qkv_self.bias + | -0.000 | -0.478 | 0.525 | 0.125 | torch.Size([120, 120]) || stage1.residual_group2.blocks.1.attn.proj.weight + | 0.009 | -0.338 | 0.449 | 0.154 | torch.Size([120]) || stage1.residual_group2.blocks.1.attn.proj.bias + | 0.597 | 0.429 | 0.741 | 0.044 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.weight + | 0.038 | -0.697 | 0.195 | 0.103 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.bias + | 0.003 | -0.671 | 0.636 | 0.135 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc11.weight + | 0.057 | -0.519 | 0.422 | 0.139 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | 
-0.629 | 0.607 | 0.153 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc12.weight + | -0.007 | -0.279 | 0.403 | 0.083 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc12.bias + | 0.001 | -0.620 | 0.712 | 0.150 | torch.Size([120, 240]) || stage1.residual_group2.blocks.1.mlp.fc2.weight + | 0.014 | -0.721 | 0.333 | 0.163 | torch.Size([120]) || stage1.residual_group2.blocks.1.mlp.fc2.bias + | 0.000 | -0.504 | 0.343 | 0.079 | torch.Size([120, 120]) || stage1.linear2.weight + | 0.015 | -0.276 | 0.353 | 0.122 | torch.Size([120]) || stage1.linear2.bias + | -0.000 | -0.151 | 0.136 | 0.025 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.weight + | -0.001 | -0.087 | 0.103 | 0.030 | torch.Size([120]) || stage1.pa_deform.bias + | -0.000 | -0.017 | 0.017 | 0.010 | torch.Size([120, 364, 3, 3]) || stage1.pa_deform.conv_offset.0.weight + | -0.004 | -0.024 | 0.040 | 0.013 | torch.Size([120]) || stage1.pa_deform.conv_offset.0.bias + | -0.001 | -0.122 | 0.123 | 0.017 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.2.weight + | -0.009 | -0.068 | 0.068 | 0.028 | torch.Size([120]) || stage1.pa_deform.conv_offset.2.bias + | -0.001 | -0.175 | 0.114 | 0.015 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.4.weight + | 0.019 | -0.059 | 0.110 | 0.042 | torch.Size([120]) || stage1.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432, 120, 3, 3]) || stage1.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432]) || stage1.pa_deform.conv_offset.6.bias + | -0.001 | -1.034 | 1.208 | 0.150 | torch.Size([360, 360]) || stage1.pa_fuse.fc11.weight + | 0.085 | -0.220 | 0.682 | 0.164 | torch.Size([360]) || stage1.pa_fuse.fc11.bias + | 0.001 | -1.305 | 1.408 | 0.167 | torch.Size([360, 360]) || stage1.pa_fuse.fc12.weight + | 0.005 | -0.474 | 0.521 | 0.147 | torch.Size([360]) || stage1.pa_fuse.fc12.bias + | 0.000 | -0.941 | 0.939 | 0.158 | torch.Size([120, 360]) || stage1.pa_fuse.fc2.weight + | 0.019 | -0.993 | 0.852 | 0.371 | torch.Size([120]) || stage1.pa_fuse.fc2.bias + | 1.099 | 0.165 | 1.669 | 0.285 | torch.Size([480]) || stage2.reshape.1.weight + | -0.009 | -0.723 | 0.825 | 0.237 | torch.Size([480]) || stage2.reshape.1.bias + | -0.000 | -0.767 | 0.672 | 0.163 | torch.Size([120, 480]) || stage2.reshape.2.weight + | -0.007 | -0.473 | 0.285 | 0.116 | torch.Size([120]) || stage2.reshape.2.bias + | 0.665 | 0.267 | 1.019 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.weight + | -0.152 | -0.897 | 0.303 | 0.218 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.bias + | -0.208 | -1.940 | 4.459 | 0.383 | torch.Size([675, 6]) || stage2.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.653 | 0.613 | 0.127 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_self.weight + | 0.003 | -0.263 | 0.270 | 0.066 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_self.bias + | 0.002 | -0.796 | 0.596 | 0.108 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.attn.proj.weight + | -0.008 | -0.955 | 0.285 | 0.127 | torch.Size([120]) || stage2.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -1.099 | 0.979 | 0.109 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_mut.weight + | 
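The stage2.reshape.* rows above (LayerNorm over 480 channels, then Linear 480 -> 120) are the 2x2 spatial downsampling between stages: a pixel-unshuffle folds each 2x2 neighbourhood into channels (4 x 120 = 480) before the linear projection. A sketch with einops; the exact pattern is inferred by inverting the stage7 Rearrange pattern printed in the structure dump:

    import torch
    from einops import rearrange

    x = torch.randn(1, 120, 7, 64, 64)   # n c d h w (illustrative sizes)
    # Fold each 2x2 spatial patch into channels: 120 -> 480, halving h and w.
    y = rearrange(x, 'n c d (h neih) (w neiw) -> n d h w (neiw neih c)',
                  neih=2, neiw=2)
    assert y.shape == (1, 7, 32, 32, 480)
    # LayerNorm(480) and Linear(480, 120) then follow, as in stage2.reshape.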
-0.000 | -0.131 | 0.090 | 0.022 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.548 | 0.301 | 0.671 | 0.063 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.weight + | 0.003 | -0.744 | 0.803 | 0.231 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.bias + | 0.001 | -0.645 | 0.555 | 0.133 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc11.weight + | 0.013 | -0.406 | 0.272 | 0.097 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.622 | 0.666 | 0.147 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc12.weight + | 0.002 | -0.228 | 0.307 | 0.085 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc12.bias + | 0.001 | -0.834 | 0.822 | 0.149 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.mlp.fc2.weight + | -0.009 | -0.948 | 0.446 | 0.159 | torch.Size([120]) || stage2.residual_group1.blocks.0.mlp.fc2.bias + | 0.777 | 0.311 | 1.104 | 0.161 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.weight + | -0.178 | -0.966 | 0.822 | 0.247 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.bias + | -0.387 | -2.000 | 5.826 | 0.443 | torch.Size([675, 6]) || stage2.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.662 | 0.706 | 0.132 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_self.weight + | -0.006 | -0.348 | 0.306 | 0.079 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_self.bias + | -0.001 | -0.595 | 0.730 | 0.112 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.attn.proj.weight + | -0.001 | -0.811 | 0.531 | 0.167 | torch.Size([120]) || stage2.residual_group1.blocks.1.attn.proj.bias + | -0.000 | -1.007 | 1.002 | 0.105 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.002 | -0.180 | 0.108 | 0.024 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.599 | 0.282 | 0.730 | 0.059 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.weight + | -0.004 | -0.671 | 0.938 | 0.218 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.536 | 0.570 | 0.134 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc11.weight + | -0.022 | -0.540 | 0.226 | 0.107 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.646 | 0.589 | 0.149 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc12.weight + | 0.008 | -0.203 | 0.282 | 0.092 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -1.052 | 0.649 | 0.150 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.mlp.fc2.weight + | -0.007 | -0.581 | 0.467 | 0.137 | torch.Size([120]) || stage2.residual_group1.blocks.1.mlp.fc2.bias + | 0.780 | 0.134 | 1.161 | 0.193 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.weight + | -0.152 | -0.996 | 1.042 | 0.227 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.bias + | -0.186 | -2.565 | 4.152 | 0.428 | torch.Size([675, 6]) || stage2.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | 
torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.2.attn.position_bias + | 0.001 | -0.856 | 0.814 | 0.151 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_self.weight + | -0.002 | -0.367 | 0.317 | 0.074 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_self.bias + | -0.001 | -0.656 | 0.730 | 0.131 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.attn.proj.weight + | -0.003 | -0.555 | 0.620 | 0.163 | torch.Size([120]) || stage2.residual_group1.blocks.2.attn.proj.bias + | 0.001 | -2.191 | 2.575 | 0.137 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.000 | -0.121 | 0.139 | 0.023 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.640 | 0.297 | 0.797 | 0.064 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.weight + | -0.013 | -0.584 | 0.934 | 0.217 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.523 | 0.556 | 0.136 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc11.weight + | -0.035 | -0.490 | 0.217 | 0.117 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.679 | 0.601 | 0.152 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc12.weight + | 0.005 | -0.287 | 0.308 | 0.098 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.576 | 0.584 | 0.151 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.mlp.fc2.weight + | -0.006 | -0.423 | 0.376 | 0.121 | torch.Size([120]) || stage2.residual_group1.blocks.2.mlp.fc2.bias + | 0.776 | 0.134 | 1.030 | 0.164 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.weight + | -0.167 | -0.870 | 1.066 | 0.204 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.bias + | -0.259 | -1.735 | 5.189 | 0.366 | torch.Size([675, 6]) || stage2.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.3.attn.position_bias + | 0.000 | -1.292 | 1.255 | 0.149 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_self.weight + | 0.000 | -0.493 | 0.445 | 0.101 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_self.bias + | 0.001 | -0.618 | 0.582 | 0.122 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.attn.proj.weight + | -0.001 | -0.543 | 0.420 | 0.166 | torch.Size([120]) || stage2.residual_group1.blocks.3.attn.proj.bias + | 0.002 | -2.296 | 2.630 | 0.162 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.001 | -0.130 | 0.149 | 0.028 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.625 | 0.301 | 0.772 | 0.060 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.weight + | -0.015 | -0.498 | 0.992 | 0.198 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.bias + | -0.000 | -0.620 | 0.681 | 0.130 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc11.weight + | -0.006 | -0.391 | 0.256 | 0.113 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.575 | 0.669 | 0.152 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc12.weight + | -0.000 | -0.225 | 0.333 | 0.088 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc12.bias + | 0.001 | -0.680 | 0.639 | 0.151 | torch.Size([120, 240]) || 
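These residual_group1 blocks carry both qkv_self and qkv_mut, and their proj weight is [120, 240] rather than [120, 120]. The natural reading, consistent with every residual_group1 entry in this dump and with the residual_group2 blocks that lack qkv_mut and project 120 -> 120, is that the self-attention and mutual-attention outputs are concatenated before the output projection. A minimal shape sketch of that inference:

    import torch
    import torch.nn as nn

    dim = 120
    qkv_self = nn.Linear(dim, 3 * dim)    # q/k/v within one frame
    qkv_mut = nn.Linear(dim, 3 * dim)     # q/k/v exchanged across a frame pair
    proj = nn.Linear(2 * dim, dim)        # concat of both outputs -> dim

    self_out = torch.randn(4, 128, dim)   # per-window self-attention result
    mut_out = torch.randn(4, 128, dim)    # per-window mutual-attention result
    y = proj(torch.cat([self_out, mut_out], dim=-1))
    assert y.shape == (4, 128, dim)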
stage2.residual_group1.blocks.3.mlp.fc2.weight + | -0.011 | -0.549 | 0.259 | 0.139 | torch.Size([120]) || stage2.residual_group1.blocks.3.mlp.fc2.bias + | 0.933 | 0.310 | 1.186 | 0.121 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.weight + | -0.180 | -0.736 | 1.168 | 0.204 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.bias + | -0.164 | -2.965 | 4.145 | 0.437 | torch.Size([675, 6]) || stage2.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.4.attn.position_bias + | 0.000 | -0.860 | 0.749 | 0.136 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_self.weight + | 0.005 | -0.274 | 0.308 | 0.080 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_self.bias + | 0.001 | -0.648 | 0.681 | 0.129 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.attn.proj.weight + | 0.002 | -0.547 | 0.295 | 0.149 | torch.Size([120]) || stage2.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.647 | 0.577 | 0.105 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.001 | -0.138 | 0.125 | 0.023 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.635 | 0.329 | 0.748 | 0.049 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.weight + | -0.018 | -0.375 | 0.891 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.603 | 0.497 | 0.130 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc11.weight + | -0.010 | -0.345 | 0.297 | 0.113 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.680 | 0.679 | 0.153 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc12.weight + | -0.000 | -0.200 | 0.251 | 0.086 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc12.bias + | -0.001 | -0.568 | 0.614 | 0.152 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.mlp.fc2.weight + | -0.009 | -0.375 | 0.493 | 0.135 | torch.Size([120]) || stage2.residual_group1.blocks.4.mlp.fc2.bias + | 0.870 | 0.315 | 1.059 | 0.096 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.weight + | -0.139 | -0.657 | 1.107 | 0.163 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.bias + | -0.156 | -4.167 | 4.651 | 0.340 | torch.Size([675, 6]) || stage2.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.701 | 0.871 | 0.134 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_self.weight + | -0.000 | -0.427 | 0.471 | 0.099 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.520 | 0.546 | 0.113 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.attn.proj.weight + | -0.008 | -0.360 | 0.350 | 0.137 | torch.Size([120]) || stage2.residual_group1.blocks.5.attn.proj.bias + | 0.001 | -0.510 | 0.502 | 0.100 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.001 | -0.092 | 0.125 | 0.021 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.597 | 0.345 | 0.691 | 0.044 | torch.Size([120]) || 
stage2.residual_group1.blocks.5.norm2.weight + | -0.015 | -0.367 | 0.987 | 0.132 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.bias + | 0.001 | -0.552 | 0.532 | 0.128 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc11.weight + | -0.009 | -0.336 | 0.253 | 0.107 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc11.bias + | 0.000 | -0.644 | 0.758 | 0.154 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc12.weight + | -0.001 | -0.243 | 0.264 | 0.088 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc12.bias + | -0.001 | -0.667 | 0.621 | 0.152 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.mlp.fc2.weight + | -0.002 | -0.447 | 1.139 | 0.183 | torch.Size([120]) || stage2.residual_group1.blocks.5.mlp.fc2.bias + | 0.002 | -0.268 | 0.331 | 0.066 | torch.Size([120, 120]) || stage2.linear1.weight + | 0.005 | -0.338 | 0.589 | 0.128 | torch.Size([120]) || stage2.linear1.bias + | 0.939 | 0.517 | 1.207 | 0.113 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.weight + | 0.023 | -0.770 | 0.614 | 0.238 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.bias + | 0.004 | -3.112 | 1.341 | 0.140 | torch.Size([3375, 6]) || stage2.residual_group2.blocks.0.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage2.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.605 | 0.580 | 0.136 | torch.Size([360, 120]) || stage2.residual_group2.blocks.0.attn.qkv_self.weight + | 0.001 | -0.591 | 0.477 | 0.112 | torch.Size([360]) || stage2.residual_group2.blocks.0.attn.qkv_self.bias + | 0.001 | -0.645 | 0.613 | 0.150 | torch.Size([120, 120]) || stage2.residual_group2.blocks.0.attn.proj.weight + | -0.031 | -0.422 | 0.330 | 0.138 | torch.Size([120]) || stage2.residual_group2.blocks.0.attn.proj.bias + | 0.684 | 0.501 | 0.807 | 0.061 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.weight + | 0.018 | -0.693 | 0.412 | 0.181 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.bias + | 0.001 | -0.559 | 0.715 | 0.125 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc11.weight + | 0.031 | -0.346 | 0.273 | 0.108 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.744 | 0.559 | 0.146 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc12.weight + | -0.005 | -0.239 | 0.270 | 0.080 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.603 | 0.871 | 0.144 | torch.Size([120, 240]) || stage2.residual_group2.blocks.0.mlp.fc2.weight + | -0.003 | -0.317 | 0.303 | 0.122 | torch.Size([120]) || stage2.residual_group2.blocks.0.mlp.fc2.bias + | 0.974 | 0.575 | 1.211 | 0.095 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.weight + | 0.023 | -0.703 | 0.556 | 0.208 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.bias + | 0.012 | -2.867 | 1.552 | 0.185 | torch.Size([3375, 6]) || stage2.residual_group2.blocks.1.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage2.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.743 | 0.663 | 0.142 | torch.Size([360, 120]) || stage2.residual_group2.blocks.1.attn.qkv_self.weight + | 0.002 | -0.647 | 0.654 | 0.141 | torch.Size([360]) || stage2.residual_group2.blocks.1.attn.qkv_self.bias + | -0.000 | -0.610 | 0.648 | 0.151 | torch.Size([120, 120]) || stage2.residual_group2.blocks.1.attn.proj.weight + | -0.028 | -0.565 | 0.416 | 0.167 | 
torch.Size([120]) || stage2.residual_group2.blocks.1.attn.proj.bias + | 0.742 | 0.522 | 0.891 | 0.076 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.weight + | 0.020 | -0.506 | 0.335 | 0.138 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.bias + | 0.001 | -0.486 | 0.512 | 0.123 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc11.weight + | 0.094 | -0.405 | 0.617 | 0.174 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.618 | 0.596 | 0.149 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc12.weight + | -0.001 | -0.276 | 0.202 | 0.077 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.668 | 0.769 | 0.148 | torch.Size([120, 240]) || stage2.residual_group2.blocks.1.mlp.fc2.weight + | -0.014 | -0.729 | 0.410 | 0.187 | torch.Size([120]) || stage2.residual_group2.blocks.1.mlp.fc2.bias + | 0.001 | -0.309 | 0.381 | 0.079 | torch.Size([120, 120]) || stage2.linear2.weight + | 0.017 | -0.403 | 0.399 | 0.133 | torch.Size([120]) || stage2.linear2.bias + | -0.000 | -0.111 | 0.126 | 0.024 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.weight + | 0.001 | -0.031 | 0.055 | 0.017 | torch.Size([120]) || stage2.pa_deform.bias + | -0.000 | -0.017 | 0.017 | 0.010 | torch.Size([120, 364, 3, 3]) || stage2.pa_deform.conv_offset.0.weight + | -0.010 | -0.038 | 0.021 | 0.012 | torch.Size([120]) || stage2.pa_deform.conv_offset.0.bias + | -0.001 | -0.113 | 0.096 | 0.020 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.2.weight + | -0.010 | -0.089 | 0.087 | 0.032 | torch.Size([120]) || stage2.pa_deform.conv_offset.2.bias + | -0.001 | -0.079 | 0.087 | 0.019 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.4.weight + | -0.015 | -0.134 | 0.121 | 0.058 | torch.Size([120]) || stage2.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432, 120, 3, 3]) || stage2.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432]) || stage2.pa_deform.conv_offset.6.bias + | 0.004 | -1.011 | 1.138 | 0.150 | torch.Size([360, 360]) || stage2.pa_fuse.fc11.weight + | 0.151 | -0.228 | 0.674 | 0.167 | torch.Size([360]) || stage2.pa_fuse.fc11.bias + | 0.001 | -0.988 | 1.066 | 0.144 | torch.Size([360, 360]) || stage2.pa_fuse.fc12.weight + | 0.009 | -0.418 | 0.533 | 0.127 | torch.Size([360]) || stage2.pa_fuse.fc12.bias + | 0.000 | -0.784 | 0.831 | 0.151 | torch.Size([120, 360]) || stage2.pa_fuse.fc2.weight + | 0.007 | -0.581 | 0.470 | 0.257 | torch.Size([120]) || stage2.pa_fuse.fc2.bias + | 1.105 | 0.504 | 1.774 | 0.248 | torch.Size([480]) || stage3.reshape.1.weight + | -0.006 | -0.633 | 0.736 | 0.296 | torch.Size([480]) || stage3.reshape.1.bias + | -0.000 | -0.682 | 0.687 | 0.168 | torch.Size([120, 480]) || stage3.reshape.2.weight + | -0.004 | -0.207 | 0.227 | 0.086 | torch.Size([120]) || stage3.reshape.2.bias + | 0.735 | 0.431 | 0.997 | 0.127 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.weight + | -0.162 | -0.753 | 0.303 | 0.198 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.bias + | -0.001 | -0.490 | 0.344 | 0.037 | torch.Size([675, 6]) || stage3.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.333 | 0.350 | 0.061 | torch.Size([360, 120]) || 
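+(A table in this format can be produced by iterating over a model's state_dict; a minimal sketch of such a helper, not the KAIR logging code itself:)
+
+    import torch
+    import torch.nn as nn
+
+    def describe_parameters(model: nn.Module) -> None:
+        # Print "mean | min | max | std | shape || name" for every tensor in
+        # the state_dict. Buffers are included, which is why deterministic
+        # integer tensors such as relative_position_index also appear above.
+        for name, t in model.state_dict().items():
+            t = t.float()
+            print(f" | {t.mean():.3f} | {t.min():.3f} | {t.max():.3f} "
+                  f"| {t.std():.3f} | {t.shape} || {name}")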
+ | 1.105 | 0.504 | 1.774 | 0.248 | torch.Size([480]) || stage3.reshape.1.weight
+ | -0.006 | -0.633 | 0.736 | 0.296 | torch.Size([480]) || stage3.reshape.1.bias
+ | -0.000 | -0.682 | 0.687 | 0.168 | torch.Size([120, 480]) || stage3.reshape.2.weight
+ | -0.004 | -0.207 | 0.227 | 0.086 | torch.Size([120]) || stage3.reshape.2.bias
+ | 0.735 | 0.431 | 0.997 | 0.127 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.weight
+ | -0.162 | -0.753 | 0.303 | 0.198 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.bias
+ | -0.001 | -0.490 | 0.344 | 0.037 | torch.Size([675, 6]) || stage3.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.0.attn.position_bias
+ | 0.000 | -0.333 | 0.350 | 0.061 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.004 | -0.195 | 0.128 | 0.039 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.359 | 0.365 | 0.067 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.attn.proj.weight
+ | -0.002 | -0.216 | 0.262 | 0.084 | torch.Size([120]) || stage3.residual_group1.blocks.0.attn.proj.bias
+ | 0.000 | -0.597 | 0.657 | 0.058 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_mut.weight
+ | 0.001 | -0.115 | 0.118 | 0.020 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.594 | 0.414 | 0.775 | 0.069 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.weight
+ | 0.003 | -0.260 | 0.315 | 0.105 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.bias
+ | 0.001 | -0.446 | 0.536 | 0.116 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc11.weight
+ | -0.077 | -0.361 | 0.145 | 0.072 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.507 | 0.503 | 0.124 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc12.weight
+ | 0.005 | -0.225 | 0.207 | 0.062 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.553 | 0.493 | 0.129 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.mlp.fc2.weight
+ | -0.006 | -0.268 | 0.158 | 0.085 | torch.Size([120]) || stage3.residual_group1.blocks.0.mlp.fc2.bias
+ | 0.716 | 0.376 | 0.965 | 0.119 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.weight
+ | -0.185 | -0.732 | 0.209 | 0.179 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.bias
+ | -0.002 | -0.462 | 1.414 | 0.064 | torch.Size([675, 6]) || stage3.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.1.attn.position_bias
+ | 0.000 | -0.383 | 0.438 | 0.060 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_self.weight
+ | -0.002 | -0.229 | 0.157 | 0.044 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.357 | 0.478 | 0.065 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.attn.proj.weight
+ | -0.004 | -0.280 | 0.216 | 0.101 | torch.Size([120]) || stage3.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.471 | 0.517 | 0.063 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_mut.weight
+ | -0.000 | -0.112 | 0.131 | 0.022 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 0.633 | 0.486 | 0.778 | 0.057 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.weight
+ | 0.004 | -0.350 | 0.280 | 0.107 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.bias
+ | 0.001 | -0.513 | 0.512 | 0.118 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.081 | -0.274 | 0.096 | 0.071 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.548 | 0.533 | 0.126 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc12.weight
+ | -0.003 | -0.181 | 0.194 | 0.059 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.499 | 0.534 | 0.128 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.mlp.fc2.weight
+ | -0.007 | -0.282 | 0.152 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.1.mlp.fc2.bias
+ | 0.796 | 0.469 | 1.007 | 0.111 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.weight
+ | -0.109 | -0.638 | 0.181 | 0.146 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.bias
+ | -0.004 | -1.009 | 1.155 | 0.105 | torch.Size([675, 6]) || stage3.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.2.attn.position_bias
+ | -0.000 | -0.378 | 0.375 | 0.081 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_self.weight
+ | 0.003 | -0.263 | 0.331 | 0.066 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_self.bias
+ | -0.000 | -0.485 | 0.366 | 0.074 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.attn.proj.weight
+ | -0.001 | -0.249 | 0.145 | 0.080 | torch.Size([120]) || stage3.residual_group1.blocks.2.attn.proj.bias
+ | -0.001 | -0.332 | 0.421 | 0.063 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_mut.weight
+ | -0.001 | -0.098 | 0.083 | 0.016 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 0.657 | 0.507 | 0.776 | 0.053 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.weight
+ | 0.003 | -0.270 | 0.280 | 0.104 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.bias
+ | 0.000 | -0.445 | 0.556 | 0.117 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.097 | -0.295 | 0.100 | 0.070 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc11.bias
+ | -0.000 | -0.480 | 0.501 | 0.126 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc12.weight
+ | 0.005 | -0.148 | 0.191 | 0.060 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc12.bias
+ | 0.001 | -0.569 | 0.484 | 0.126 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.mlp.fc2.weight
+ | -0.006 | -0.246 | 0.161 | 0.082 | torch.Size([120]) || stage3.residual_group1.blocks.2.mlp.fc2.bias
+ | 0.814 | 0.482 | 1.048 | 0.109 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.weight
+ | -0.138 | -0.585 | 0.128 | 0.129 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.bias
+ | -0.008 | -1.801 | 4.148 | 0.110 | torch.Size([675, 6]) || stage3.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.3.attn.position_bias
+ | -0.001 | -0.364 | 0.546 | 0.076 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_self.weight
+ | 0.003 | -0.179 | 0.182 | 0.046 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_self.bias
+ | 0.000 | -0.378 | 0.385 | 0.070 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.attn.proj.weight
+ | -0.005 | -0.368 | 0.175 | 0.101 | torch.Size([120]) || stage3.residual_group1.blocks.3.attn.proj.bias
+ | 0.000 | -0.338 | 0.461 | 0.062 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_mut.weight
+ | 0.000 | -0.098 | 0.082 | 0.019 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 0.676 | 0.526 | 0.799 | 0.056 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.weight
+ | 0.002 | -0.269 | 0.242 | 0.090 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.bias
+ | 0.000 | -0.474 | 0.505 | 0.118 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.095 | -0.247 | 0.071 | 0.063 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc11.bias
+ | 0.000 | -0.518 | 0.502 | 0.126 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.003 | -0.194 | 0.228 | 0.068 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc12.bias
+ | -0.001 | -0.502 | 0.499 | 0.124 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.mlp.fc2.weight
+ | -0.007 | -0.248 | 0.207 | 0.098 | torch.Size([120]) || stage3.residual_group1.blocks.3.mlp.fc2.bias
+ | 0.843 | 0.498 | 1.046 | 0.099 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.weight
+ | -0.082 | -0.456 | 0.195 | 0.111 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.bias
+ | -0.012 | -3.133 | 2.263 | 0.177 | torch.Size([675, 6]) || stage3.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.4.attn.position_bias
+ | 0.001 | -0.494 | 0.443 | 0.096 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.004 | -0.492 | 0.329 | 0.088 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_self.bias
+ | -0.000 | -0.464 | 0.391 | 0.080 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.attn.proj.weight
+ | -0.003 | -0.420 | 0.332 | 0.124 | torch.Size([120]) || stage3.residual_group1.blocks.4.attn.proj.bias
+ | 0.001 | -0.469 | 0.518 | 0.068 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_mut.weight
+ | 0.001 | -0.068 | 0.099 | 0.014 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.705 | 0.598 | 0.823 | 0.047 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.weight
+ | 0.001 | -0.161 | 0.155 | 0.065 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.bias
+ | 0.000 | -0.526 | 0.442 | 0.119 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc11.weight
+ | -0.102 | -0.319 | 0.054 | 0.072 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc11.bias
+ | 0.000 | -0.555 | 0.499 | 0.126 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc12.weight
+ | -0.003 | -0.201 | 0.135 | 0.065 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc12.bias
+ | 0.001 | -0.454 | 0.522 | 0.122 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.011 | -0.379 | 0.195 | 0.091 | torch.Size([120]) || stage3.residual_group1.blocks.4.mlp.fc2.bias
+ | 0.856 | 0.618 | 1.073 | 0.095 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.weight
+ | -0.059 | -0.368 | 0.153 | 0.095 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.bias
+ | -0.006 | -1.747 | 1.724 | 0.133 | torch.Size([675, 6]) || stage3.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -0.399 | 0.417 | 0.090 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_self.weight
+ | 0.009 | -0.294 | 0.398 | 0.079 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_self.bias
+ | 0.001 | -0.345 | 0.341 | 0.067 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.attn.proj.weight
+ | -0.004 | -0.435 | 0.326 | 0.113 | torch.Size([120]) || stage3.residual_group1.blocks.5.attn.proj.bias
+ | -0.000 | -0.370 | 0.339 | 0.052 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_mut.weight
+ | -0.000 | -0.059 | 0.060 | 0.012 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.707 | 0.600 | 0.832 | 0.051 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.weight
+ | -0.001 | -0.157 | 0.140 | 0.063 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.bias
+ | 0.001 | -0.473 | 0.464 | 0.117 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc11.weight
+ | -0.091 | -0.291 | 0.092 | 0.073 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc11.bias
+ | -0.000 | -0.479 | 0.477 | 0.124 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.004 | -0.197 | 0.180 | 0.063 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc12.bias
+ | -0.001 | -0.504 | 0.440 | 0.118 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.mlp.fc2.weight
+ | -0.008 | -0.449 | 0.421 | 0.135 | torch.Size([120]) || stage3.residual_group1.blocks.5.mlp.fc2.bias
+ | 0.003 | -0.331 | 0.524 | 0.083 | torch.Size([120, 120]) || stage3.linear1.weight
+ | -0.001 | -0.270 | 0.250 | 0.116 | torch.Size([120]) || stage3.linear1.bias
+ | 0.883 | 0.354 | 1.107 | 0.120 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.weight
+ | 0.011 | -0.416 | 0.299 | 0.131 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.bias
+ | 0.000 | -0.322 | 0.139 | 0.028 | torch.Size([3375, 6]) || stage3.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage3.residual_group2.blocks.0.attn.relative_position_index
+ | 0.000 | -0.470 | 0.455 | 0.097 | torch.Size([360, 120]) || stage3.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.007 | -0.384 | 0.374 | 0.125 | torch.Size([360]) || stage3.residual_group2.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.467 | 0.428 | 0.109 | torch.Size([120, 120]) || stage3.residual_group2.blocks.0.attn.proj.weight
+ | -0.009 | -0.348 | 0.279 | 0.126 | torch.Size([120]) || stage3.residual_group2.blocks.0.attn.proj.bias
+ | 0.873 | 0.618 | 1.060 | 0.070 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.weight
+ | 0.005 | -0.242 | 0.278 | 0.098 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.bias
+ | 0.000 | -0.549 | 0.437 | 0.115 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.053 | -0.174 | 0.127 | 0.058 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.469 | 0.517 | 0.124 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.002 | -0.133 | 0.187 | 0.052 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.548 | 0.557 | 0.125 | torch.Size([120, 240]) || stage3.residual_group2.blocks.0.mlp.fc2.weight
+ | -0.011 | -0.339 | 0.303 | 0.116 | torch.Size([120]) || stage3.residual_group2.blocks.0.mlp.fc2.bias
+ | 0.960 | 0.744 | 1.153 | 0.095 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.weight
+ | 0.004 | -0.302 | 0.238 | 0.099 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.bias
+ | 0.000 | -0.567 | 0.133 | 0.032 | torch.Size([3375, 6]) || stage3.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage3.residual_group2.blocks.1.attn.relative_position_index
+ | 0.000 | -0.425 | 0.414 | 0.087 | torch.Size([360, 120]) || stage3.residual_group2.blocks.1.attn.qkv_self.weight
+ | 0.001 | -0.419 | 0.485 | 0.116 | torch.Size([360]) || stage3.residual_group2.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.429 | 0.385 | 0.095 | torch.Size([120, 120]) || stage3.residual_group2.blocks.1.attn.proj.weight
+ | -0.011 | -0.398 | 0.287 | 0.123 | torch.Size([120]) || stage3.residual_group2.blocks.1.attn.proj.bias
+ | 0.909 | 0.770 | 1.090 | 0.066 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.weight
+ | -0.000 | -0.204 | 0.175 | 0.073 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.bias
+ | 0.000 | -0.451 | 0.462 | 0.115 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.069 | -0.268 | 0.143 | 0.077 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.488 | 0.602 | 0.126 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc12.weight
+ | -0.004 | -0.179 | 0.114 | 0.050 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.480 | 0.466 | 0.118 | torch.Size([120, 240]) || stage3.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.007 | -0.358 | 0.225 | 0.102 | torch.Size([120]) || stage3.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.003 | -0.274 | 0.457 | 0.073 | torch.Size([120, 120]) || stage3.linear2.weight
+ | 0.002 | -0.532 | 0.438 | 0.200 | torch.Size([120]) || stage3.linear2.bias
+ | -0.000 | -0.098 | 0.115 | 0.025 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.weight
+ | 0.002 | -0.033 | 0.041 | 0.015 | torch.Size([120]) || stage3.pa_deform.bias
+ | 0.000 | -0.017 | 0.017 | 0.010 | torch.Size([120, 364, 3, 3]) || stage3.pa_deform.conv_offset.0.weight
+ | -0.010 | -0.030 | 0.017 | 0.010 | torch.Size([120]) || stage3.pa_deform.conv_offset.0.bias
+ | -0.000 | -0.078 | 0.069 | 0.020 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.2.weight
+ | -0.006 | -0.055 | 0.067 | 0.026 | torch.Size([120]) || stage3.pa_deform.conv_offset.2.bias
+ | -0.001 | -0.071 | 0.067 | 0.020 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.4.weight
+ | 0.004 | -0.070 | 0.113 | 0.042 | torch.Size([120]) || stage3.pa_deform.conv_offset.4.bias
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432, 120, 3, 3]) || stage3.pa_deform.conv_offset.6.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432]) || stage3.pa_deform.conv_offset.6.bias
+ | 0.004 | -0.623 | 0.669 | 0.126 | torch.Size([360, 360]) || stage3.pa_fuse.fc11.weight
+ | 0.092 | -0.221 | 0.676 | 0.151 | torch.Size([360]) || stage3.pa_fuse.fc11.bias
+ | 0.000 | -0.604 | 0.689 | 0.125 | torch.Size([360, 360]) || stage3.pa_fuse.fc12.weight
+ | 0.008 | -0.544 | 0.379 | 0.118 | torch.Size([360]) || stage3.pa_fuse.fc12.bias
+ | 0.000 | -0.669 | 0.719 | 0.151 | torch.Size([120, 360]) || stage3.pa_fuse.fc2.weight
+ | -0.005 | -0.411 | 0.443 | 0.155 | torch.Size([120]) || stage3.pa_fuse.fc2.bias
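+(The all-zero statistics for every pa_deform.conv_offset.6 tensor are expected rather than a defect: the final conv of a deformable-convolution offset head is commonly zero-initialized so that predicted offsets and masks start at zero and alignment begins as an identity warp. A hedged sketch of that init, assuming a modulated DCN where 432 output channels = 16 deformable groups x (2 offsets + 1 mask) x 3x3 kernel positions:)
+
+    import torch.nn as nn
+
+    # Hypothetical offset head tail, matching shape [432, 120, 3, 3] above.
+    conv_offset_last = nn.Conv2d(120, 432, kernel_size=3, padding=1)
+    nn.init.zeros_(conv_offset_last.weight)  # offsets start at 0
+    nn.init.zeros_(conv_offset_last.bias)    # -> identity alignment at start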
+ | 1.005 | 0.488 | 1.503 | 0.166 | torch.Size([480]) || stage4.reshape.1.weight
+ | 0.001 | -0.316 | 0.358 | 0.118 | torch.Size([480]) || stage4.reshape.1.bias
+ | 0.000 | -0.486 | 0.450 | 0.084 | torch.Size([120, 480]) || stage4.reshape.2.weight
+ | -0.007 | -0.139 | 0.092 | 0.043 | torch.Size([120]) || stage4.reshape.2.bias
+ | 0.996 | 0.831 | 1.101 | 0.039 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.weight
+ | -0.014 | -0.109 | 0.112 | 0.040 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.bias
+ | 0.000 | -0.064 | 0.064 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.0.attn.position_bias
+ | -0.000 | -0.109 | 0.107 | 0.023 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.001 | -0.033 | 0.029 | 0.009 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.256 | 0.235 | 0.030 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.attn.proj.weight
+ | 0.007 | -0.099 | 0.227 | 0.051 | torch.Size([120]) || stage4.residual_group1.blocks.0.attn.proj.bias
+ | -0.000 | -0.129 | 0.142 | 0.025 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_mut.weight
+ | -0.000 | -0.035 | 0.029 | 0.006 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.966 | 0.869 | 1.089 | 0.041 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.weight
+ | 0.000 | -0.155 | 0.152 | 0.058 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.bias
+ | -0.000 | -0.248 | 0.221 | 0.024 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc11.weight
+ | -0.002 | -0.066 | 0.012 | 0.007 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.287 | 0.219 | 0.024 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc12.weight
+ | 0.000 | -0.085 | 0.067 | 0.010 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.256 | 0.235 | 0.025 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.009 | -0.123 | 0.254 | 0.058 | torch.Size([120]) || stage4.residual_group1.blocks.0.mlp.fc2.bias
+ | 0.988 | 0.825 | 1.079 | 0.043 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.weight
+ | -0.013 | -0.123 | 0.105 | 0.047 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.bias
+ | -0.000 | -0.081 | 0.078 | 0.021 | torch.Size([675, 6]) || stage4.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.1.attn.position_bias
+ | 0.000 | -0.133 | 0.170 | 0.025 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_self.weight
+ | -0.000 | -0.053 | 0.048 | 0.014 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.177 | 0.174 | 0.031 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.attn.proj.weight
+ | 0.008 | -0.099 | 0.204 | 0.048 | torch.Size([120]) || stage4.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.138 | 0.130 | 0.026 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_mut.weight
+ | 0.000 | -0.061 | 0.059 | 0.010 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 0.996 | 0.943 | 1.081 | 0.026 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.weight
+ | 0.001 | -0.064 | 0.051 | 0.027 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.bias
+ | -0.000 | -0.336 | 0.268 | 0.024 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc11.weight
+ | 0.000 | -0.029 | 0.028 | 0.006 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.223 | 0.272 | 0.024 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc12.weight
+ | -0.001 | -0.084 | 0.037 | 0.009 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.207 | 0.216 | 0.024 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.007 | -0.140 | 0.216 | 0.058 | torch.Size([120]) || stage4.residual_group1.blocks.1.mlp.fc2.bias
+ | 0.994 | 0.855 | 1.108 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.weight
+ | -0.019 | -0.115 | 0.091 | 0.028 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.bias
+ | 0.000 | -0.063 | 0.076 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.2.attn.position_bias
+ | -0.000 | -0.190 | 0.179 | 0.027 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_self.weight
+ | -0.001 | -0.043 | 0.039 | 0.011 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_self.bias
+ | 0.000 | -0.158 | 0.161 | 0.030 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.attn.proj.weight
+ | 0.008 | -0.118 | 0.164 | 0.050 | torch.Size([120]) || stage4.residual_group1.blocks.2.attn.proj.bias
+ | -0.000 | -0.213 | 0.211 | 0.029 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_mut.weight
+ | -0.000 | -0.043 | 0.040 | 0.010 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 0.993 | 0.903 | 1.099 | 0.028 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.weight
+ | 0.003 | -0.097 | 0.106 | 0.044 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.bias
+ | 0.000 | -0.186 | 0.177 | 0.024 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.000 | -0.068 | 0.045 | 0.010 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc11.bias
+ | 0.000 | -0.307 | 0.185 | 0.024 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc12.weight
+ | -0.000 | -0.081 | 0.061 | 0.010 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc12.bias
+ | 0.000 | -0.195 | 0.216 | 0.024 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.mlp.fc2.weight
+ | 0.008 | -0.115 | 0.161 | 0.050 | torch.Size([120]) || stage4.residual_group1.blocks.2.mlp.fc2.bias
+ | 0.997 | 0.893 | 1.071 | 0.032 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.weight
+ | -0.019 | -0.083 | 0.047 | 0.024 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.bias
+ | 0.001 | -0.076 | 0.073 | 0.021 | torch.Size([675, 6]) || stage4.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.275 | 0.259 | 0.029 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_self.weight
+ | -0.001 | -0.071 | 0.066 | 0.017 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_self.bias
+ | -0.000 | -0.166 | 0.157 | 0.028 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.attn.proj.weight
+ | 0.008 | -0.105 | 0.149 | 0.043 | torch.Size([120]) || stage4.residual_group1.blocks.3.attn.proj.bias
+ | 0.000 | -0.184 | 0.197 | 0.028 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_mut.weight
+ | 0.001 | -0.042 | 0.050 | 0.008 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 1.001 | 0.971 | 1.136 | 0.022 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.weight
+ | -0.002 | -0.054 | 0.050 | 0.023 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.bias
+ | 0.000 | -0.329 | 0.210 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.000 | -0.078 | 0.029 | 0.009 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc11.bias
+ | -0.000 | -0.234 | 0.241 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc12.weight
+ | 0.000 | -0.031 | 0.024 | 0.006 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.169 | 0.164 | 0.023 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.007 | -0.085 | 0.114 | 0.043 | torch.Size([120]) || stage4.residual_group1.blocks.3.mlp.fc2.bias
+ | 1.003 | 0.901 | 1.099 | 0.044 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.weight
+ | -0.034 | -0.095 | 0.039 | 0.030 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.bias
+ | 0.000 | -0.071 | 0.090 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.238 | 0.268 | 0.034 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.002 | -0.199 | 0.144 | 0.030 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_self.bias
+ | -0.000 | -0.167 | 0.218 | 0.029 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.attn.proj.weight
+ | 0.008 | -0.089 | 0.140 | 0.039 | torch.Size([120]) || stage4.residual_group1.blocks.4.attn.proj.bias
+ | 0.000 | -0.267 | 0.253 | 0.031 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_mut.weight
+ | 0.001 | -0.067 | 0.069 | 0.009 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 1.004 | 0.953 | 1.056 | 0.014 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.weight
+ | -0.001 | -0.056 | 0.077 | 0.021 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.bias
+ | -0.000 | -0.170 | 0.184 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc11.weight
+ | 0.001 | -0.037 | 0.027 | 0.007 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc11.bias
+ | 0.000 | -0.149 | 0.202 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.000 | -0.059 | 0.095 | 0.010 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc12.bias
+ | -0.000 | -0.145 | 0.181 | 0.023 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.mlp.fc2.weight
+ | 0.006 | -0.086 | 0.117 | 0.036 | torch.Size([120]) || stage4.residual_group1.blocks.4.mlp.fc2.bias
+ | 0.996 | 0.859 | 1.077 | 0.047 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.weight
+ | -0.058 | -0.153 | 0.009 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.bias
+ | 0.000 | -0.087 | 0.083 | 0.021 | torch.Size([675, 6]) || stage4.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -0.249 | 0.266 | 0.033 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_self.weight
+ | -0.001 | -0.199 | 0.168 | 0.031 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_self.bias
+ | 0.000 | -0.156 | 0.142 | 0.027 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.attn.proj.weight
+ | 0.004 | -0.102 | 0.145 | 0.045 | torch.Size([120]) || stage4.residual_group1.blocks.5.attn.proj.bias
+ | 0.000 | -0.299 | 0.376 | 0.033 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_mut.weight
+ | 0.000 | -0.034 | 0.066 | 0.007 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.992 | 0.924 | 1.097 | 0.025 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.weight
+ | -0.002 | -0.089 | 0.074 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.bias
+ | -0.000 | -0.192 | 0.208 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc11.weight
+ | -0.002 | -0.064 | 0.021 | 0.009 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc11.bias
+ | -0.000 | -0.240 | 0.191 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.000 | -0.040 | 0.044 | 0.008 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc12.bias
+ | -0.000 | -0.141 | 0.155 | 0.022 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.mlp.fc2.weight
+ | 0.005 | -0.107 | 0.103 | 0.045 | torch.Size([120]) || stage4.residual_group1.blocks.5.mlp.fc2.bias
+ | 0.001 | -0.286 | 0.303 | 0.059 | torch.Size([120, 120]) || stage4.linear1.weight
+ | -0.012 | -0.311 | 0.190 | 0.090 | torch.Size([120]) || stage4.linear1.bias
+ | 1.009 | 0.926 | 1.101 | 0.028 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.weight
+ | -0.001 | -0.036 | 0.048 | 0.015 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.bias
+ | 0.000 | -0.071 | 0.076 | 0.020 | torch.Size([3375, 6]) || stage4.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage4.residual_group2.blocks.0.attn.relative_position_index
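+(The relative_position_index entries are deterministic buffers rather than learned weights, which is why every block reports identical stats: indices into the bias table run 0..674 with mean 337 for the [128, 128] case, and 0..3374 with mean 1687 for the [512, 512] case. A sketch of how such an index can be built Swin-style for a (2, 8, 8) attention window, where the table has (2*2-1)*(2*8-1)*(2*8-1) = 675 entries and 2*8*8 = 128 tokens; an (8, 8, 8) window gives 3375 and 512 the same way:)
+
+    import torch
+
+    def relative_position_index(d: int, h: int, w: int) -> torch.Tensor:
+        # Pairwise relative-coordinate index for a (d, h, w) window.
+        coords = torch.stack(torch.meshgrid(
+            torch.arange(d), torch.arange(h), torch.arange(w), indexing="ij"))
+        coords = coords.flatten(1)                     # [3, d*h*w]
+        rel = coords[:, :, None] - coords[:, None, :]  # [3, N, N]
+        rel = rel.permute(1, 2, 0).contiguous()
+        rel[..., 0] += d - 1                           # shift to start from 0
+        rel[..., 1] += h - 1
+        rel[..., 2] += w - 1
+        rel[..., 0] *= (2 * h - 1) * (2 * w - 1)       # flatten 3D offset
+        rel[..., 1] *= 2 * w - 1
+        return rel.sum(-1)                             # [N, N]
+
+    idx = relative_position_index(2, 8, 8)  # shape [128, 128], values 0..674
+    print(idx.float().mean())               # 337.0, matching the log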
+ | -0.000 | -0.135 | 0.141 | 0.023 | torch.Size([360, 120]) || stage4.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.001 | -0.023 | 0.021 | 0.007 | torch.Size([360]) || stage4.residual_group2.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.115 | 0.121 | 0.025 | torch.Size([120, 120]) || stage4.residual_group2.blocks.0.attn.proj.weight
+ | -0.007 | -0.200 | 0.098 | 0.043 | torch.Size([120]) || stage4.residual_group2.blocks.0.attn.proj.bias
+ | 1.002 | 0.999 | 1.016 | 0.002 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.weight
+ | 0.000 | -0.003 | 0.004 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.bias
+ | 0.000 | -0.082 | 0.094 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc11.weight
+ | 0.000 | -0.005 | 0.017 | 0.002 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.088 | 0.079 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.000 | -0.010 | 0.008 | 0.002 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.090 | 0.105 | 0.020 | torch.Size([120, 240]) || stage4.residual_group2.blocks.0.mlp.fc2.weight
+ | -0.006 | -0.181 | 0.096 | 0.041 | torch.Size([120]) || stage4.residual_group2.blocks.0.mlp.fc2.bias
+ | 1.006 | 0.923 | 1.098 | 0.025 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.weight
+ | -0.001 | -0.045 | 0.053 | 0.019 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.bias
+ | -0.000 | -0.083 | 0.085 | 0.020 | torch.Size([3375, 6]) || stage4.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage4.residual_group2.blocks.1.attn.relative_position_index
+ | -0.000 | -0.132 | 0.133 | 0.023 | torch.Size([360, 120]) || stage4.residual_group2.blocks.1.attn.qkv_self.weight
+ | -0.000 | -0.030 | 0.035 | 0.009 | torch.Size([360]) || stage4.residual_group2.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.129 | 0.094 | 0.024 | torch.Size([120, 120]) || stage4.residual_group2.blocks.1.attn.proj.weight
+ | -0.008 | -0.218 | 0.116 | 0.048 | torch.Size([120]) || stage4.residual_group2.blocks.1.attn.proj.bias
+ | 1.003 | 0.999 | 1.024 | 0.003 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.weight
+ | -0.000 | -0.004 | 0.005 | 0.002 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.bias
+ | -0.000 | -0.126 | 0.080 | 0.021 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc11.weight
+ | 0.001 | -0.006 | 0.016 | 0.003 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.092 | 0.076 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc12.weight
+ | 0.000 | -0.015 | 0.013 | 0.003 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.091 | 0.115 | 0.020 | torch.Size([120, 240]) || stage4.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.006 | -0.196 | 0.090 | 0.041 | torch.Size([120]) || stage4.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.001 | -0.291 | 0.416 | 0.059 | torch.Size([120, 120]) || stage4.linear2.weight
+ | -0.009 | -0.269 | 0.198 | 0.094 | torch.Size([120]) || stage4.linear2.bias
+ | 0.000 | -0.053 | 0.057 | 0.019 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.weight
+ | -0.001 | -0.021 | 0.021 | 0.009 | torch.Size([120]) || stage4.pa_deform.bias
+ | 0.000 | -0.017 | 0.017 | 0.010 | torch.Size([120, 364, 3, 3]) || stage4.pa_deform.conv_offset.0.weight
+ | -0.000 | -0.015 | 0.015 | 0.009 | torch.Size([120]) || stage4.pa_deform.conv_offset.0.bias
+ | -0.000 | -0.039 | 0.041 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.2.weight
+ | 0.000 | -0.030 | 0.029 | 0.018 | torch.Size([120]) || stage4.pa_deform.conv_offset.2.bias
+ | -0.000 | -0.045 | 0.041 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.4.weight
+ | -0.002 | -0.031 | 0.030 | 0.016 | torch.Size([120]) || stage4.pa_deform.conv_offset.4.bias
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432, 120, 3, 3]) || stage4.pa_deform.conv_offset.6.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432]) || stage4.pa_deform.conv_offset.6.bias
+ | -0.000 | -0.356 | 0.435 | 0.035 | torch.Size([360, 360]) || stage4.pa_fuse.fc11.weight
+ | 0.003 | -0.080 | 0.304 | 0.033 | torch.Size([360]) || stage4.pa_fuse.fc11.bias
+ | 0.000 | -0.361 | 0.436 | 0.035 | torch.Size([360, 360]) || stage4.pa_fuse.fc12.weight
+ | -0.001 | -0.166 | 0.299 | 0.032 | torch.Size([360]) || stage4.pa_fuse.fc12.bias
+ | -0.000 | -0.748 | 0.752 | 0.056 | torch.Size([120, 360]) || stage4.pa_fuse.fc2.weight
+ | -0.000 | -0.262 | 0.270 | 0.086 | torch.Size([120]) || stage4.pa_fuse.fc2.bias
+ | 0.980 | 0.710 | 1.274 | 0.146 | torch.Size([30]) || stage5.reshape.1.weight
+ | -0.002 | -0.062 | 0.057 | 0.036 | torch.Size([30]) || stage5.reshape.1.bias
+ | 0.001 | -0.530 | 0.432 | 0.092 | torch.Size([120, 30]) || stage5.reshape.2.weight
+ | 0.021 | -0.305 | 0.337 | 0.080 | torch.Size([120]) || stage5.reshape.2.bias
+ | 0.994 | 0.934 | 1.012 | 0.016 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.weight
+ | -0.014 | -0.040 | 0.038 | 0.014 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.bias
+ | 0.000 | -0.082 | 0.072 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.0.attn.position_bias
+ | -0.000 | -0.078 | 0.101 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.000 | -0.022 | 0.023 | 0.005 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.198 | 0.237 | 0.022 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.attn.proj.weight
+ | -0.003 | -0.067 | 0.082 | 0.027 | torch.Size([120]) || stage5.residual_group1.blocks.0.attn.proj.bias
+ | 0.000 | -0.103 | 0.092 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_mut.weight
+ | -0.000 | -0.007 | 0.006 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.991 | 0.929 | 1.004 | 0.011 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.weight
+ | 0.001 | -0.009 | 0.014 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.bias
+ | -0.000 | -0.112 | 0.093 | 0.021 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc11.weight
+ | -0.001 | -0.033 | 0.027 | 0.008 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.098 | 0.085 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc12.weight
+ | -0.000 | -0.033 | 0.026 | 0.009 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.163 | 0.140 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.003 | -0.060 | 0.110 | 0.032 | torch.Size([120]) || stage5.residual_group1.blocks.0.mlp.fc2.bias
+ | 0.992 | 0.872 | 1.010 | 0.018 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.weight
+ | -0.015 | -0.039 | 0.031 | 0.010 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.bias
+ | -0.000 | -0.078 | 0.078 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.1.attn.position_bias
+ | 0.000 | -0.088 | 0.099 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.030 | 0.030 | 0.006 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.151 | 0.185 | 0.022 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.attn.proj.weight
+ | -0.005 | -0.073 | 0.061 | 0.024 | torch.Size([120]) || stage5.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.093 | 0.089 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_mut.weight
+ | 0.000 | -0.009 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 0.997 | 0.923 | 1.003 | 0.008 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.weight
+ | 0.000 | -0.008 | 0.009 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.bias
+ | -0.000 | -0.082 | 0.092 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.000 | -0.023 | 0.021 | 0.007 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.082 | 0.078 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc12.weight
+ | -0.001 | -0.028 | 0.025 | 0.008 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.097 | 0.090 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.000 | -0.062 | 0.102 | 0.028 | torch.Size([120]) || stage5.residual_group1.blocks.1.mlp.fc2.bias
+ | 0.994 | 0.845 | 1.015 | 0.018 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.weight
+ | -0.018 | -0.045 | 0.016 | 0.008 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.bias
+ | 0.000 | -0.065 | 0.068 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.2.attn.position_bias
+ | -0.000 | -0.088 | 0.113 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_self.weight
+ | 0.000 | -0.022 | 0.020 | 0.005 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_self.bias
+ | -0.000 | -0.124 | 0.124 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.attn.proj.weight
+ | -0.001 | -0.061 | 0.049 | 0.020 | torch.Size([120]) || stage5.residual_group1.blocks.2.attn.proj.bias
+ | -0.000 | -0.088 | 0.087 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_mut.weight
+ | -0.000 | -0.008 | 0.005 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 0.993 | 0.847 | 1.012 | 0.016 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.weight
+ | 0.000 | -0.014 | 0.015 | 0.007 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.bias
+ | 0.000 | -0.096 | 0.096 | 0.021 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc11.weight
+ | 0.001 | -0.038 | 0.027 | 0.009 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc11.bias
+ | -0.000 | -0.090 | 0.095 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc12.weight
+ | 0.000 | -0.045 | 0.039 | 0.011 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc12.bias
+ | -0.000 | -0.153 | 0.130 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.mlp.fc2.weight
+ | -0.006 | -0.097 | 0.083 | 0.028 | torch.Size([120]) || stage5.residual_group1.blocks.2.mlp.fc2.bias
+ | 0.984 | 0.798 | 1.006 | 0.023 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.weight
+ | -0.018 | -0.042 | 0.003 | 0.010 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.bias
+ | 0.000 | -0.074 | 0.214 | 0.021 | torch.Size([675, 6]) || stage5.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.133 | 0.132 | 0.022 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_self.weight
+ | -0.000 | -0.035 | 0.037 | 0.008 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_self.bias
+ | -0.000 | -0.121 | 0.123 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.attn.proj.weight
+ | -0.002 | -0.043 | 0.049 | 0.016 | torch.Size([120]) || stage5.residual_group1.blocks.3.attn.proj.bias
+ | 0.000 | -0.082 | 0.093 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_mut.weight
+ | -0.000 | -0.007 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 0.993 | 0.809 | 1.008 | 0.018 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.weight
+ | 0.001 | -0.018 | 0.013 | 0.006 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.bias
+ | -0.000 | -0.100 | 0.097 | 0.021 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc11.weight
+ | 0.001 | -0.038 | 0.045 | 0.009 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc11.bias
+ | -0.000 | -0.104 | 0.095 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.000 | -0.043 | 0.040 | 0.011 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.108 | 0.121 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.002 | -0.066 | 0.048 | 0.023 | torch.Size([120]) || stage5.residual_group1.blocks.3.mlp.fc2.bias
+ | 0.988 | 0.835 | 1.035 | 0.019 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.weight
+ | -0.022 | -0.052 | 0.003 | 0.013 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.bias
+ | -0.000 | -0.086 | 0.118 | 0.021 | torch.Size([675, 6]) || stage5.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.4.attn.position_bias
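+(The position_bias buffers likewise log identical stats in every block (mean 0.487, range [-1, 1], shape [1, 64, 120]), which is characteristic of a fixed sine-cosine encoding rather than a learned parameter; the positive mean plausibly comes from the low-frequency cosine channels sitting near 1. A sketch under that assumption, not a verbatim copy of the VRT code:)
+
+    import math
+    import torch
+
+    # Fixed sinusoidal encoding for 64 positions and 120 channels; values are
+    # bounded in [-1, 1] and the slow cosine channels stay close to 1.
+    n_pos, dim = 64, 120
+    pos = torch.arange(n_pos, dtype=torch.float32).unsqueeze(1)
+    div = torch.exp(torch.arange(0, dim, 2, dtype=torch.float32)
+                    * (-math.log(10000.0) / dim))
+    pe = torch.zeros(1, n_pos, dim)
+    pe[0, :, 0::2] = torch.sin(pos * div)
+    pe[0, :, 1::2] = torch.cos(pos * div)
+    print(pe.mean(), pe.min(), pe.max())  # mean near 0.5, bounded by [-1, 1]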
+ | 0.000 | -0.199 | 0.223 | 0.023 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.000 | -0.045 | 0.028 | 0.009 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_self.bias
+ | 0.000 | -0.114 | 0.143 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.attn.proj.weight
+ | -0.003 | -0.060 | 0.047 | 0.021 | torch.Size([120]) || stage5.residual_group1.blocks.4.attn.proj.bias
+ | -0.000 | -0.117 | 0.102 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_mut.weight
+ | 0.000 | -0.008 | 0.010 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.994 | 0.774 | 1.007 | 0.021 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.weight
+ | 0.001 | -0.023 | 0.027 | 0.010 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.bias
+ | -0.000 | -0.085 | 0.107 | 0.021 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc11.weight
+ | 0.003 | -0.044 | 0.042 | 0.013 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc11.bias
+ | -0.000 | -0.103 | 0.080 | 0.021 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.000 | -0.067 | 0.058 | 0.015 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc12.bias
+ | 0.000 | -0.096 | 0.103 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.000 | -0.045 | 0.054 | 0.023 | torch.Size([120]) || stage5.residual_group1.blocks.4.mlp.fc2.bias
+ | 0.985 | 0.552 | 1.092 | 0.044 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.weight
+ | -0.023 | -0.073 | 0.024 | 0.019 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.bias
+ | -0.000 | -0.080 | 0.121 | 0.021 | torch.Size([675, 6]) || stage5.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -1.776 | 0.186 | 0.026 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_self.weight
+ | -0.000 | -0.070 | 0.065 | 0.015 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_self.bias
+ | 0.000 | -0.230 | 0.359 | 0.022 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.attn.proj.weight
+ | -0.001 | -0.062 | 0.079 | 0.028 | torch.Size([120]) || stage5.residual_group1.blocks.5.attn.proj.bias
+ | -0.000 | -0.086 | 0.104 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_mut.weight
+ | -0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.976 | 0.863 | 0.995 | 0.015 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.weight
+ | -0.001 | -0.037 | 0.053 | 0.018 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.bias
+ | -0.000 | -0.121 | 0.100 | 0.021 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc11.weight
+ | 0.009 | -0.074 | 0.101 | 0.021 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc11.bias
+ | 0.000 | -0.102 | 0.101 | 0.021 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.001 | -0.092 | 0.082 | 0.028 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc12.bias
+ | -0.000 | -0.148 | 0.202 | 0.022 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.mlp.fc2.weight
+ | 0.001 | -0.056 | 0.054 | 0.025 | torch.Size([120]) || stage5.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.139 | 0.123 | 0.024 | torch.Size([120, 120]) || stage5.linear1.weight
+ | 0.022 | -0.317 | 0.336 | 0.081 | torch.Size([120]) || stage5.linear1.bias
+ | 0.963 | 0.765 | 1.026 | 0.058 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.weight
+ | -0.001 | -0.315 | 0.286 | 0.078 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.bias
+ | 0.000 | -0.077 | 0.080 | 0.020 | torch.Size([3375, 6]) || stage5.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage5.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.159 | 0.119 | 0.022 | torch.Size([360, 120]) || stage5.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.000 | -0.038 | 0.044 | 0.013 | torch.Size([360]) || stage5.residual_group2.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.134 | 0.126 | 0.024 | torch.Size([120, 120]) || stage5.residual_group2.blocks.0.attn.proj.weight
+ | -0.005 | -0.263 | 0.230 | 0.060 | torch.Size([120]) || stage5.residual_group2.blocks.0.attn.proj.bias
+ | 0.990 | 0.913 | 1.001 | 0.017 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.weight
+ | 0.000 | -0.009 | 0.010 | 0.004 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.bias
+ | -0.000 | -0.077 | 0.089 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.004 | -0.025 | 0.016 | 0.007 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.073 | 0.090 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.000 | -0.018 | 0.018 | 0.007 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.084 | 0.083 | 0.020 | torch.Size([120, 240]) || stage5.residual_group2.blocks.0.mlp.fc2.weight
+ | -0.006 | -0.264 | 0.273 | 0.056 | torch.Size([120]) || stage5.residual_group2.blocks.0.mlp.fc2.bias
+ | 0.976 | 0.733 | 1.048 | 0.053 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.weight
+ | -0.001 | -0.265 | 0.241 | 0.061 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.bias
+ | -0.000 | -0.079 | 0.081 | 0.020 | torch.Size([3375, 6]) || stage5.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage5.residual_group2.blocks.1.attn.relative_position_index
+ | -0.000 | -0.145 | 0.145 | 0.023 | torch.Size([360, 120]) || stage5.residual_group2.blocks.1.attn.qkv_self.weight
+ | -0.000 | -0.031 | 0.051 | 0.009 | torch.Size([360]) || stage5.residual_group2.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.114 | 0.103 | 0.025 | torch.Size([120, 120]) || stage5.residual_group2.blocks.1.attn.proj.weight
+ | -0.011 | -0.166 | 0.119 | 0.032 | torch.Size([120]) || stage5.residual_group2.blocks.1.attn.proj.bias
+ | 0.993 | 0.939 | 1.001 | 0.012 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.weight
+ | 0.000 | -0.011 | 0.008 | 0.004 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.bias
+ | -0.000 | -0.090 | 0.081 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.002 | -0.026 | 0.020 | 0.007 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.092 | 0.078 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc12.weight
+ | 0.000 | -0.020 | 0.021 | 0.007 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.097 | 0.093 | 0.020 | torch.Size([120, 240]) || stage5.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.016 | -0.224 | 0.158 | 0.041 | torch.Size([120]) || stage5.residual_group2.blocks.1.mlp.fc2.bias
+ | -0.000 | -0.244 | 0.248 | 0.044 | torch.Size([120, 120]) || stage5.linear2.weight
+ | 0.022 | -0.367 | 0.377 | 0.103 | torch.Size([120]) || stage5.linear2.bias
+ | -0.000 | -0.153 | 0.112 | 0.022 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.weight
+ | -0.004 | -0.061 | 0.053 | 0.023 | torch.Size([120]) || stage5.pa_deform.bias
+ | 0.000 | -0.017 | 0.017 | 0.010 | torch.Size([120, 364, 3, 3]) || stage5.pa_deform.conv_offset.0.weight
+ | -0.010 | -0.038 | 0.022 | 0.013 | torch.Size([120]) || stage5.pa_deform.conv_offset.0.bias
+ | -0.001 | -0.081 | 0.076 | 0.020 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.2.weight
+ | -0.008 | -0.062 | 0.031 | 0.021 | torch.Size([120]) || stage5.pa_deform.conv_offset.2.bias
+ | -0.000 | -0.080 | 0.079 | 0.019 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.4.weight
+ | -0.005 | -0.057 | 0.035 | 0.020 | torch.Size([120]) || stage5.pa_deform.conv_offset.4.bias
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432, 120, 3, 3]) || stage5.pa_deform.conv_offset.6.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432]) || stage5.pa_deform.conv_offset.6.bias
+ | 0.000 | -0.590 | 0.536 | 0.063 | torch.Size([360, 360]) || stage5.pa_fuse.fc11.weight
+ | 0.075 | -0.075 | 0.431 | 0.094 | torch.Size([360]) || stage5.pa_fuse.fc11.bias
+ | 0.000 | -0.704 | 0.718 | 0.064 | torch.Size([360, 360]) || stage5.pa_fuse.fc12.weight
+ | 0.005 | -0.308 | 0.337 | 0.073 | torch.Size([360]) || stage5.pa_fuse.fc12.bias
+ | 0.000 | -0.702 | 0.735 | 0.101 | torch.Size([120, 360]) || stage5.pa_fuse.fc2.weight
+ | -0.005 | -0.422 | 0.451 | 0.157 | torch.Size([120]) || stage5.pa_fuse.fc2.bias
stage6.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.315 | 0.354 | 0.057 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_self.weight + | 0.001 | -0.184 | 0.148 | 0.047 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.626 | 0.422 | 0.060 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.attn.proj.weight + | 0.004 | -0.234 | 0.187 | 0.087 | torch.Size([120]) || stage6.residual_group1.blocks.1.attn.proj.bias + | -0.000 | -0.692 | 0.743 | 0.058 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.000 | -0.038 | 0.041 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.590 | 0.287 | 0.942 | 0.125 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.weight + | -0.006 | -0.196 | 0.203 | 0.076 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.427 | 0.431 | 0.075 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc11.weight + | -0.080 | -0.242 | 0.033 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.293 | 0.362 | 0.069 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc12.weight + | 0.001 | -0.171 | 0.207 | 0.047 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.423 | 0.467 | 0.077 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.mlp.fc2.weight + | 0.000 | -0.152 | 0.184 | 0.057 | torch.Size([120]) || stage6.residual_group1.blocks.1.mlp.fc2.bias + | 0.703 | 0.255 | 1.008 | 0.132 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.weight + | -0.125 | -0.342 | 0.042 | 0.078 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.bias + | 0.000 | -0.381 | 0.350 | 0.052 | torch.Size([675, 6]) || stage6.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.426 | 0.500 | 0.058 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_self.weight + | -0.003 | -0.262 | 0.226 | 0.054 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_self.bias + | -0.001 | -0.299 | 0.325 | 0.055 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.attn.proj.weight + | -0.001 | -0.149 | 0.096 | 0.061 | torch.Size([120]) || stage6.residual_group1.blocks.2.attn.proj.bias + | 0.000 | -0.406 | 0.391 | 0.055 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.001 | -0.055 | 0.085 | 0.015 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.666 | 0.308 | 0.942 | 0.118 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.weight + | -0.005 | -0.203 | 0.265 | 0.086 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.349 | 0.494 | 0.072 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc11.weight + | -0.071 | -0.213 | 0.071 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.294 | 0.408 | 0.066 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc12.weight + | -0.003 | -0.120 | 0.147 | 0.049 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc12.bias + | -0.000 | -0.303 | 0.304 | 0.073 | torch.Size([120, 240]) || 
stage6.residual_group1.blocks.2.mlp.fc2.weight + | -0.005 | -0.150 | 0.129 | 0.063 | torch.Size([120]) || stage6.residual_group1.blocks.2.mlp.fc2.bias + | 0.702 | 0.307 | 0.960 | 0.129 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.weight + | -0.100 | -0.262 | 0.057 | 0.070 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.bias + | 0.001 | -0.501 | 0.290 | 0.062 | torch.Size([675, 6]) || stage6.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.349 | 0.336 | 0.061 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_self.weight + | 0.001 | -0.287 | 0.202 | 0.053 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.322 | 0.401 | 0.056 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.attn.proj.weight + | -0.004 | -0.182 | 0.151 | 0.062 | torch.Size([120]) || stage6.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.441 | 0.444 | 0.054 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.000 | -0.038 | 0.033 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.666 | 0.317 | 0.970 | 0.117 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.weight + | -0.003 | -0.173 | 0.168 | 0.067 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.bias + | -0.000 | -0.354 | 0.408 | 0.070 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc11.weight + | -0.072 | -0.297 | 0.067 | 0.065 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.299 | 0.335 | 0.066 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc12.weight + | -0.004 | -0.191 | 0.136 | 0.060 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc12.bias + | -0.000 | -0.400 | 0.590 | 0.071 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.mlp.fc2.weight + | -0.005 | -0.159 | 0.142 | 0.061 | torch.Size([120]) || stage6.residual_group1.blocks.3.mlp.fc2.bias + | 0.730 | 0.334 | 0.963 | 0.118 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.weight + | -0.064 | -0.201 | 0.064 | 0.055 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.702 | 1.180 | 0.086 | torch.Size([675, 6]) || stage6.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.483 | 0.398 | 0.073 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_self.weight + | 0.004 | -0.480 | 0.514 | 0.080 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.331 | 0.390 | 0.056 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.attn.proj.weight + | -0.004 | -0.141 | 0.167 | 0.050 | torch.Size([120]) || stage6.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.387 | 0.470 | 0.048 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.001 | -0.065 | 0.039 | 0.010 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.656 | 0.235 | 0.874 | 0.105 | torch.Size([120]) || 
stage6.residual_group1.blocks.4.norm2.weight + | -0.005 | -0.237 | 0.171 | 0.074 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.440 | 0.483 | 0.075 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc11.weight + | -0.076 | -0.347 | 0.110 | 0.076 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.286 | 0.348 | 0.070 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc12.weight + | 0.001 | -0.189 | 0.169 | 0.069 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.398 | 0.336 | 0.075 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.mlp.fc2.weight + | -0.004 | -0.127 | 0.137 | 0.052 | torch.Size([120]) || stage6.residual_group1.blocks.4.mlp.fc2.bias + | 0.691 | 0.178 | 0.975 | 0.116 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.weight + | -0.042 | -0.137 | 0.099 | 0.037 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.bias + | -0.001 | -0.662 | 1.078 | 0.078 | torch.Size([675, 6]) || stage6.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.359 | 0.531 | 0.072 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_self.weight + | 0.002 | -0.293 | 0.311 | 0.075 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_self.bias + | 0.000 | -0.426 | 0.488 | 0.055 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.attn.proj.weight + | -0.006 | -0.103 | 0.159 | 0.044 | torch.Size([120]) || stage6.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.401 | 0.385 | 0.044 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.001 | -0.039 | 0.043 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.607 | 0.210 | 0.802 | 0.094 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.weight + | -0.004 | -0.178 | 0.199 | 0.068 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.377 | 0.541 | 0.079 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc11.weight + | -0.069 | -0.429 | 0.280 | 0.096 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.394 | 0.344 | 0.077 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc12.weight + | 0.000 | -0.241 | 0.223 | 0.085 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -0.527 | 0.647 | 0.077 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.mlp.fc2.weight + | -0.006 | -0.126 | 0.157 | 0.047 | torch.Size([120]) || stage6.residual_group1.blocks.5.mlp.fc2.bias + | -0.001 | -0.294 | 0.287 | 0.060 | torch.Size([120, 120]) || stage6.linear1.weight + | 0.006 | -0.543 | 0.664 | 0.193 | torch.Size([120]) || stage6.linear1.bias + | 0.674 | 0.222 | 1.065 | 0.154 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.weight + | 0.002 | -0.480 | 0.311 | 0.128 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.bias + | 0.000 | -0.629 | 0.461 | 0.041 | torch.Size([3375, 6]) || stage6.residual_group2.blocks.0.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage6.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.495 | 0.440 | 0.085 | 
torch.Size([360, 120]) || stage6.residual_group2.blocks.0.attn.qkv_self.weight + | -0.001 | -0.516 | 0.468 | 0.114 | torch.Size([360]) || stage6.residual_group2.blocks.0.attn.qkv_self.bias + | 0.001 | -0.369 | 0.377 | 0.085 | torch.Size([120, 120]) || stage6.residual_group2.blocks.0.attn.proj.weight + | -0.003 | -0.297 | 0.292 | 0.113 | torch.Size([120]) || stage6.residual_group2.blocks.0.attn.proj.bias + | 0.644 | 0.181 | 1.104 | 0.153 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.weight + | 0.003 | -0.167 | 0.185 | 0.070 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.bias + | -0.000 | -0.383 | 0.534 | 0.087 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc11.weight + | -0.101 | -0.214 | 0.048 | 0.051 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc11.bias + | 0.000 | -0.350 | 0.560 | 0.085 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc12.weight + | -0.005 | -0.159 | 0.138 | 0.047 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc12.bias + | -0.001 | -0.374 | 0.488 | 0.091 | torch.Size([120, 240]) || stage6.residual_group2.blocks.0.mlp.fc2.weight + | -0.006 | -0.271 | 0.252 | 0.096 | torch.Size([120]) || stage6.residual_group2.blocks.0.mlp.fc2.bias + | 0.663 | 0.353 | 0.959 | 0.106 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.weight + | 0.001 | -0.314 | 0.289 | 0.089 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.bias + | 0.000 | -0.772 | 0.763 | 0.041 | torch.Size([3375, 6]) || stage6.residual_group2.blocks.1.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage6.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.495 | 0.604 | 0.086 | torch.Size([360, 120]) || stage6.residual_group2.blocks.1.attn.qkv_self.weight + | 0.005 | -0.491 | 0.401 | 0.097 | torch.Size([360]) || stage6.residual_group2.blocks.1.attn.qkv_self.bias + | 0.001 | -0.380 | 0.376 | 0.076 | torch.Size([120, 120]) || stage6.residual_group2.blocks.1.attn.proj.weight + | -0.007 | -0.321 | 0.234 | 0.096 | torch.Size([120]) || stage6.residual_group2.blocks.1.attn.proj.bias + | 0.666 | 0.226 | 1.153 | 0.138 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.weight + | 0.001 | -0.178 | 0.220 | 0.069 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.514 | 0.608 | 0.090 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc11.weight + | -0.132 | -0.313 | 0.023 | 0.059 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.423 | 0.488 | 0.088 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc12.weight + | -0.002 | -0.153 | 0.122 | 0.053 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc12.bias + | 0.000 | -0.399 | 0.435 | 0.087 | torch.Size([120, 240]) || stage6.residual_group2.blocks.1.mlp.fc2.weight + | -0.001 | -0.285 | 0.241 | 0.093 | torch.Size([120]) || stage6.residual_group2.blocks.1.mlp.fc2.bias + | 0.000 | -0.308 | 0.365 | 0.070 | torch.Size([120, 120]) || stage6.linear2.weight + | -0.002 | -0.699 | 0.757 | 0.303 | torch.Size([120]) || stage6.linear2.bias + | 0.000 | -0.130 | 0.129 | 0.027 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.weight + | -0.001 | -0.051 | 0.045 | 0.018 | torch.Size([120]) || stage6.pa_deform.bias + | -0.000 | -0.017 | 0.017 | 0.010 | torch.Size([120, 364, 3, 3]) || stage6.pa_deform.conv_offset.0.weight + | -0.007 | -0.049 | 0.026 | 0.012 | torch.Size([120]) || stage6.pa_deform.conv_offset.0.bias 
+ | -0.001 | -0.090 | 0.114 | 0.020 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.2.weight + | -0.008 | -0.070 | 0.060 | 0.030 | torch.Size([120]) || stage6.pa_deform.conv_offset.2.bias + | -0.001 | -0.097 | 0.101 | 0.020 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.4.weight + | 0.006 | -0.096 | 0.114 | 0.044 | torch.Size([120]) || stage6.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432, 120, 3, 3]) || stage6.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432]) || stage6.pa_deform.conv_offset.6.bias + | -0.002 | -0.822 | 0.740 | 0.127 | torch.Size([360, 360]) || stage6.pa_fuse.fc11.weight + | 0.212 | -0.394 | 0.913 | 0.216 | torch.Size([360]) || stage6.pa_fuse.fc11.bias + | -0.000 | -0.948 | 0.848 | 0.131 | torch.Size([360, 360]) || stage6.pa_fuse.fc12.weight + | 0.001 | -0.657 | 0.605 | 0.279 | torch.Size([360]) || stage6.pa_fuse.fc12.bias + | -0.000 | -0.678 | 0.823 | 0.158 | torch.Size([120, 360]) || stage6.pa_fuse.fc2.weight + | 0.009 | -0.616 | 0.477 | 0.283 | torch.Size([120]) || stage6.pa_fuse.fc2.bias + | 1.363 | 1.278 | 1.458 | 0.048 | torch.Size([30]) || stage7.reshape.1.weight + | -0.001 | -0.247 | 0.227 | 0.139 | torch.Size([30]) || stage7.reshape.1.bias + | -0.000 | -0.590 | 0.587 | 0.179 | torch.Size([120, 30]) || stage7.reshape.2.weight + | -0.029 | -0.525 | 0.546 | 0.231 | torch.Size([120]) || stage7.reshape.2.bias + | 0.406 | 0.101 | 0.864 | 0.138 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.weight + | -0.159 | -0.667 | 0.525 | 0.161 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.bias + | -0.174 | -2.385 | 4.798 | 0.381 | torch.Size([675, 6]) || stage7.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.809 | 0.687 | 0.111 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_self.weight + | 0.001 | -0.275 | 0.262 | 0.057 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_self.bias + | -0.000 | -0.416 | 0.438 | 0.096 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.attn.proj.weight + | 0.008 | -0.499 | 0.295 | 0.131 | torch.Size([120]) || stage7.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -1.494 | 1.378 | 0.106 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.123 | 0.106 | 0.015 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.284 | 0.172 | 0.377 | 0.040 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.weight + | -0.003 | -0.502 | 0.588 | 0.124 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.597 | 0.567 | 0.132 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc11.weight + | -0.061 | -0.420 | 0.409 | 0.104 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.606 | 0.601 | 0.144 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc12.weight + | -0.003 | -0.306 | 0.261 | 0.101 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc12.bias + | -0.001 | -0.572 | 0.609 | 0.149 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.mlp.fc2.weight + | -0.008 | -0.373 | 0.306 | 0.099 | torch.Size([120]) || stage7.residual_group1.blocks.0.mlp.fc2.bias + | 
0.538 | 0.114 | 0.809 | 0.125 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.weight + | -0.129 | -0.865 | 0.532 | 0.163 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.bias + | -0.281 | -2.710 | 4.413 | 0.432 | torch.Size([675, 6]) || stage7.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.646 | 0.655 | 0.135 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_self.weight + | -0.000 | -0.301 | 0.303 | 0.068 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.479 | 0.463 | 0.100 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.attn.proj.weight + | 0.016 | -0.460 | 0.313 | 0.135 | torch.Size([120]) || stage7.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -2.205 | 2.065 | 0.127 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.000 | -0.074 | 0.085 | 0.017 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.353 | 0.243 | 0.425 | 0.034 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.weight + | -0.008 | -0.643 | 0.628 | 0.146 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.535 | 0.617 | 0.135 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc11.weight + | -0.054 | -0.348 | 0.244 | 0.109 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc11.bias + | -0.001 | -0.671 | 0.611 | 0.148 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc12.weight + | 0.004 | -0.272 | 0.292 | 0.098 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.672 | 0.595 | 0.149 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.mlp.fc2.weight + | -0.003 | -0.398 | 0.273 | 0.088 | torch.Size([120]) || stage7.residual_group1.blocks.1.mlp.fc2.bias + | 0.581 | 0.093 | 0.791 | 0.147 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.weight + | -0.143 | -1.023 | 0.481 | 0.167 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.bias + | -0.098 | -2.171 | 4.402 | 0.287 | torch.Size([675, 6]) || stage7.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.640 | 0.701 | 0.147 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_self.weight + | -0.005 | -0.328 | 0.408 | 0.072 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_self.bias + | -0.001 | -0.417 | 0.441 | 0.101 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.attn.proj.weight + | 0.007 | -0.508 | 0.265 | 0.127 | torch.Size([120]) || stage7.residual_group1.blocks.2.attn.proj.bias + | -0.001 | -2.511 | 2.484 | 0.143 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.000 | -0.093 | 0.104 | 0.019 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.392 | 0.276 | 0.487 | 0.034 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.weight + | -0.016 | -0.555 | 0.581 | 0.143 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.bias + | -0.000 | 
-0.630 | 0.674 | 0.135 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc11.weight + | -0.072 | -0.420 | 0.173 | 0.115 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.654 | 0.793 | 0.152 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc12.weight + | -0.003 | -0.303 | 0.263 | 0.098 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.603 | 0.658 | 0.150 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.mlp.fc2.weight + | 0.003 | -0.301 | 0.247 | 0.081 | torch.Size([120]) || stage7.residual_group1.blocks.2.mlp.fc2.bias + | 0.611 | 0.127 | 0.811 | 0.134 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.weight + | -0.137 | -0.781 | 0.684 | 0.164 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.bias + | -0.109 | -4.577 | 4.527 | 0.332 | torch.Size([675, 6]) || stage7.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.757 | 0.743 | 0.146 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_self.weight + | 0.001 | -0.358 | 0.342 | 0.083 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_self.bias + | 0.001 | -0.465 | 0.447 | 0.097 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.attn.proj.weight + | 0.002 | -0.389 | 0.233 | 0.113 | torch.Size([120]) || stage7.residual_group1.blocks.3.attn.proj.bias + | -0.001 | -1.947 | 1.928 | 0.127 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.000 | -0.106 | 0.070 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.410 | 0.283 | 0.489 | 0.035 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.weight + | -0.014 | -0.442 | 0.639 | 0.147 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.bias + | -0.000 | -0.542 | 0.585 | 0.132 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc11.weight + | -0.069 | -0.463 | 0.214 | 0.122 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.689 | 0.605 | 0.154 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc12.weight + | -0.008 | -0.307 | 0.279 | 0.096 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc12.bias + | -0.000 | -0.593 | 0.603 | 0.152 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.mlp.fc2.weight + | 0.010 | -0.269 | 0.270 | 0.094 | torch.Size([120]) || stage7.residual_group1.blocks.3.mlp.fc2.bias + | 0.652 | 0.132 | 0.859 | 0.133 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.weight + | -0.131 | -0.662 | 0.729 | 0.163 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.bias + | -0.092 | -4.521 | 3.027 | 0.337 | torch.Size([675, 6]) || stage7.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.694 | 0.828 | 0.148 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_self.weight + | 0.002 | -0.328 | 0.361 | 0.078 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.430 | 
0.483 | 0.100 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.attn.proj.weight + | -0.003 | -0.368 | 0.250 | 0.103 | torch.Size([120]) || stage7.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -1.506 | 1.779 | 0.122 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.000 | -0.090 | 0.112 | 0.020 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.435 | 0.347 | 0.536 | 0.033 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.weight + | -0.018 | -0.345 | 0.609 | 0.136 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.bias + | -0.001 | -0.580 | 0.558 | 0.132 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc11.weight + | -0.066 | -0.392 | 0.239 | 0.128 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.608 | 0.667 | 0.157 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc12.weight + | -0.001 | -0.276 | 0.296 | 0.105 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.666 | 0.775 | 0.155 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.mlp.fc2.weight + | 0.001 | -0.380 | 0.360 | 0.101 | torch.Size([120]) || stage7.residual_group1.blocks.4.mlp.fc2.bias + | 0.648 | 0.269 | 0.885 | 0.109 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.weight + | -0.116 | -0.436 | 0.749 | 0.144 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.bias + | -0.130 | -3.976 | 4.665 | 0.318 | torch.Size([675, 6]) || stage7.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.702 | 0.671 | 0.140 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_self.weight + | 0.000 | -0.346 | 0.340 | 0.078 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.410 | 0.394 | 0.091 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.attn.proj.weight + | 0.006 | -0.286 | 0.244 | 0.100 | torch.Size([120]) || stage7.residual_group1.blocks.5.attn.proj.bias + | 0.001 | -0.870 | 0.885 | 0.109 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.001 | -0.120 | 0.096 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.445 | 0.326 | 0.595 | 0.034 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.weight + | -0.016 | -0.233 | 0.558 | 0.110 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.bias + | -0.001 | -0.576 | 0.577 | 0.129 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc11.weight + | -0.038 | -0.525 | 0.269 | 0.139 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.672 | 0.671 | 0.158 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc12.weight + | 0.003 | -0.400 | 0.281 | 0.116 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.937 | 0.714 | 0.156 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.mlp.fc2.weight + | 0.007 | -0.435 | 0.876 | 0.188 | torch.Size([120]) || stage7.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.234 | 0.212 | 0.056 | torch.Size([120, 120]) || stage7.linear1.weight + | -0.033 | -0.655 | 0.586 | 0.242 | torch.Size([120]) || stage7.linear1.bias + | 0.684 | 0.257 | 
0.867 | 0.090 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.weight + | -0.003 | -0.857 | 0.829 | 0.193 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.bias + | -0.005 | -5.628 | 1.358 | 0.121 | torch.Size([3375, 6]) || stage7.residual_group2.blocks.0.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage7.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.699 | 0.827 | 0.137 | torch.Size([360, 120]) || stage7.residual_group2.blocks.0.attn.qkv_self.weight + | 0.001 | -0.821 | 0.662 | 0.143 | torch.Size([360]) || stage7.residual_group2.blocks.0.attn.qkv_self.bias + | 0.001 | -0.392 | 0.418 | 0.106 | torch.Size([120, 120]) || stage7.residual_group2.blocks.0.attn.proj.weight + | 0.003 | -0.147 | 0.171 | 0.052 | torch.Size([120]) || stage7.residual_group2.blocks.0.attn.proj.bias + | 0.431 | 0.316 | 0.521 | 0.036 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.weight + | -0.003 | -0.595 | 0.673 | 0.129 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.bias + | -0.000 | -0.701 | 0.542 | 0.119 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc11.weight + | 0.017 | -0.290 | 0.421 | 0.117 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.603 | 0.637 | 0.145 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc12.weight + | -0.006 | -0.394 | 0.426 | 0.098 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.602 | 0.607 | 0.144 | torch.Size([120, 240]) || stage7.residual_group2.blocks.0.mlp.fc2.weight + | -0.003 | -0.460 | 0.272 | 0.112 | torch.Size([120]) || stage7.residual_group2.blocks.0.mlp.fc2.bias + | 0.655 | 0.251 | 0.779 | 0.074 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.weight + | -0.004 | -0.718 | 0.811 | 0.153 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.bias + | -0.007 | -3.104 | 1.224 | 0.101 | torch.Size([3375, 6]) || stage7.residual_group2.blocks.1.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage7.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.664 | 0.647 | 0.137 | torch.Size([360, 120]) || stage7.residual_group2.blocks.1.attn.qkv_self.weight + | 0.002 | -0.532 | 0.746 | 0.150 | torch.Size([360]) || stage7.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.428 | 0.360 | 0.100 | torch.Size([120, 120]) || stage7.residual_group2.blocks.1.attn.proj.weight + | 0.009 | -0.244 | 0.242 | 0.063 | torch.Size([120]) || stage7.residual_group2.blocks.1.attn.proj.bias + | 0.442 | 0.284 | 0.530 | 0.038 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.weight + | -0.004 | -0.421 | 0.664 | 0.106 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.bias + | -0.001 | -0.604 | 0.583 | 0.119 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc11.weight + | 0.028 | -0.389 | 0.406 | 0.134 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc11.bias + | -0.001 | -0.681 | 0.818 | 0.148 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc12.weight + | 0.003 | -0.247 | 0.361 | 0.096 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.783 | 0.835 | 0.146 | torch.Size([120, 240]) || stage7.residual_group2.blocks.1.mlp.fc2.weight + | 0.008 | -0.529 | 0.922 | 0.144 | torch.Size([120]) || stage7.residual_group2.blocks.1.mlp.fc2.bias + | -0.001 | -0.353 | 0.277 | 0.071 | torch.Size([120, 120]) 
|| stage7.linear2.weight + | -0.026 | -0.905 | 0.749 | 0.262 | torch.Size([120]) || stage7.linear2.bias + | -0.000 | -0.125 | 0.138 | 0.027 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.weight + | -0.003 | -0.091 | 0.071 | 0.030 | torch.Size([120]) || stage7.pa_deform.bias + | -0.000 | -0.017 | 0.017 | 0.010 | torch.Size([120, 364, 3, 3]) || stage7.pa_deform.conv_offset.0.weight + | -0.000 | -0.028 | 0.054 | 0.015 | torch.Size([120]) || stage7.pa_deform.conv_offset.0.bias + | -0.001 | -0.130 | 0.111 | 0.017 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.2.weight + | -0.004 | -0.105 | 0.094 | 0.040 | torch.Size([120]) || stage7.pa_deform.conv_offset.2.bias + | -0.002 | -0.203 | 0.124 | 0.016 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.4.weight + | 0.027 | -0.097 | 0.151 | 0.048 | torch.Size([120]) || stage7.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432, 120, 3, 3]) || stage7.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432]) || stage7.pa_deform.conv_offset.6.bias + | -0.002 | -0.997 | 1.031 | 0.156 | torch.Size([360, 360]) || stage7.pa_fuse.fc11.weight + | 0.219 | -0.261 | 0.769 | 0.213 | torch.Size([360]) || stage7.pa_fuse.fc11.bias + | 0.001 | -1.119 | 1.206 | 0.175 | torch.Size([360, 360]) || stage7.pa_fuse.fc12.weight + | -0.011 | -0.547 | 0.598 | 0.195 | torch.Size([360]) || stage7.pa_fuse.fc12.bias + | 0.000 | -0.860 | 0.957 | 0.160 | torch.Size([120, 360]) || stage7.pa_fuse.fc2.weight + | 0.018 | -1.017 | 0.731 | 0.363 | torch.Size([120]) || stage7.pa_fuse.fc2.bias + | 1.491 | 1.080 | 1.847 | 0.135 | torch.Size([120]) || stage8.0.1.weight + | -0.012 | -0.370 | 0.414 | 0.140 | torch.Size([120]) || stage8.0.1.bias + | -0.000 | -0.882 | 1.114 | 0.177 | torch.Size([180, 120]) || stage8.0.2.weight + | -0.005 | -1.101 | 0.699 | 0.167 | torch.Size([180]) || stage8.0.2.bias + | 0.622 | 0.186 | 1.009 | 0.188 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.weight + | -0.006 | -0.884 | 1.056 | 0.212 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.bias + | -0.003 | -2.578 | 2.238 | 0.223 | torch.Size([3375, 6]) || stage8.1.residual_group.blocks.0.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.1.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -1.042 | 1.335 | 0.152 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.0.attn.qkv_self.weight + | -0.007 | -0.992 | 0.938 | 0.208 | torch.Size([540]) || stage8.1.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.692 | 0.565 | 0.129 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.0.attn.proj.weight + | 0.009 | -1.288 | 0.895 | 0.185 | torch.Size([180]) || stage8.1.residual_group.blocks.0.attn.proj.bias + | 0.415 | 0.180 | 0.539 | 0.066 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.weight + | -0.006 | -0.634 | 0.818 | 0.145 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.bias + | 0.001 | -0.969 | 0.867 | 0.145 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc11.weight + | -0.055 | -0.545 | 0.271 | 0.110 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.698 | 0.845 | 0.153 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc12.weight + | 0.007 | -0.526 | 0.444 | 0.126 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.812 | 0.874 | 0.155 | torch.Size([180, 360]) || 
stage8.1.residual_group.blocks.0.mlp.fc2.weight + | 0.009 | -0.468 | 0.864 | 0.160 | torch.Size([180]) || stage8.1.residual_group.blocks.0.mlp.fc2.bias + | 0.724 | 0.198 | 0.915 | 0.128 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.weight + | -0.003 | -1.026 | 0.953 | 0.209 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.bias + | 0.030 | -3.042 | 1.112 | 0.227 | torch.Size([3375, 6]) || stage8.1.residual_group.blocks.1.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.1.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -1.192 | 0.952 | 0.169 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.1.attn.qkv_self.weight + | -0.009 | -1.186 | 0.822 | 0.191 | torch.Size([540]) || stage8.1.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.500 | 0.647 | 0.121 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.1.attn.proj.weight + | 0.004 | -0.892 | 1.020 | 0.208 | torch.Size([180]) || stage8.1.residual_group.blocks.1.attn.proj.bias + | 0.492 | 0.230 | 0.628 | 0.064 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.weight + | -0.006 | -0.853 | 0.872 | 0.165 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.bias + | 0.001 | -0.748 | 0.701 | 0.150 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc11.weight + | -0.055 | -0.409 | 0.305 | 0.096 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.806 | 0.662 | 0.155 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc12.weight + | 0.001 | -0.304 | 0.419 | 0.096 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.841 | 0.781 | 0.154 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.1.mlp.fc2.weight + | 0.005 | -0.280 | 0.641 | 0.119 | torch.Size([180]) || stage8.1.residual_group.blocks.1.mlp.fc2.bias + | 0.803 | 0.314 | 1.038 | 0.110 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.weight + | -0.006 | -1.202 | 1.119 | 0.207 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.bias + | -0.002 | -2.783 | 1.481 | 0.236 | torch.Size([3375, 6]) || stage8.1.residual_group.blocks.2.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.1.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.957 | 0.943 | 0.162 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.2.attn.qkv_self.weight + | 0.002 | -0.519 | 0.526 | 0.136 | torch.Size([540]) || stage8.1.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.543 | 0.516 | 0.117 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.2.attn.proj.weight + | 0.005 | -0.711 | 0.838 | 0.184 | torch.Size([180]) || stage8.1.residual_group.blocks.2.attn.proj.bias + | 0.549 | 0.206 | 0.679 | 0.078 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.weight + | -0.005 | -0.888 | 0.879 | 0.154 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.bias + | 0.000 | -0.748 | 0.896 | 0.148 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc11.weight + | -0.073 | -0.478 | 0.193 | 0.098 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.628 | 0.674 | 0.157 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc12.weight + | -0.001 | -0.331 | 0.230 | 0.082 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc12.bias + | 0.001 | -0.677 | 0.673 | 0.154 | torch.Size([180, 360]) || 
stage8.1.residual_group.blocks.2.mlp.fc2.weight + | 0.004 | -0.294 | 0.745 | 0.112 | torch.Size([180]) || stage8.1.residual_group.blocks.2.mlp.fc2.bias + | 0.843 | 0.308 | 0.966 | 0.094 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.weight + | -0.002 | -1.222 | 1.324 | 0.192 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.bias + | 0.001 | -2.899 | 2.240 | 0.272 | torch.Size([3375, 6]) || stage8.1.residual_group.blocks.3.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.1.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.999 | 0.935 | 0.167 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.3.attn.qkv_self.weight + | -0.001 | -0.612 | 0.531 | 0.127 | torch.Size([540]) || stage8.1.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.591 | 0.537 | 0.112 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.3.attn.proj.weight + | -0.005 | -0.476 | 1.034 | 0.188 | torch.Size([180]) || stage8.1.residual_group.blocks.3.attn.proj.bias + | 0.534 | 0.198 | 0.660 | 0.074 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.weight + | -0.006 | -0.845 | 0.869 | 0.130 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.bias + | 0.001 | -0.649 | 0.677 | 0.147 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc11.weight + | -0.080 | -0.378 | 0.228 | 0.109 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.628 | 0.683 | 0.157 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc12.weight + | -0.005 | -0.300 | 0.222 | 0.083 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc12.bias + | 0.001 | -0.959 | 0.733 | 0.153 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.3.mlp.fc2.weight + | 0.003 | -0.915 | 0.961 | 0.165 | torch.Size([180]) || stage8.1.residual_group.blocks.3.mlp.fc2.bias + | 0.001 | -0.411 | 0.533 | 0.070 | torch.Size([180, 180]) || stage8.1.linear.weight + | -0.004 | -0.907 | 0.257 | 0.135 | torch.Size([180]) || stage8.1.linear.bias + | 0.890 | 0.143 | 1.178 | 0.177 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.weight + | -0.034 | -0.781 | 0.959 | 0.177 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.bias + | 0.001 | -2.545 | 1.182 | 0.186 | torch.Size([3375, 6]) || stage8.2.residual_group.blocks.0.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.2.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -1.151 | 1.199 | 0.158 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.731 | 0.744 | 0.155 | torch.Size([540]) || stage8.2.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.522 | 0.577 | 0.131 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.0.attn.proj.weight + | 0.003 | -0.537 | 0.895 | 0.164 | torch.Size([180]) || stage8.2.residual_group.blocks.0.attn.proj.bias + | 0.599 | 0.203 | 0.779 | 0.101 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.weight + | -0.021 | -0.429 | 1.016 | 0.143 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.bias + | -0.000 | -0.914 | 0.736 | 0.145 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc11.weight + | -0.054 | -0.545 | 0.183 | 0.106 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.716 | 0.750 | 0.155 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc12.weight + | 0.003 | 
-0.254 | 0.408 | 0.085 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.842 | 0.706 | 0.153 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.0.mlp.fc2.weight + | 0.001 | -0.277 | 0.365 | 0.093 | torch.Size([180]) || stage8.2.residual_group.blocks.0.mlp.fc2.bias + | 0.910 | 0.151 | 1.164 | 0.152 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.weight + | -0.032 | -0.801 | 1.151 | 0.191 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.bias + | -0.069 | -2.776 | 5.771 | 0.290 | torch.Size([3375, 6]) || stage8.2.residual_group.blocks.1.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.2.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -1.359 | 1.101 | 0.156 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.1.attn.qkv_self.weight + | 0.009 | -0.624 | 0.654 | 0.155 | torch.Size([540]) || stage8.2.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.565 | 0.575 | 0.134 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.1.attn.proj.weight + | -0.004 | -0.671 | 0.566 | 0.171 | torch.Size([180]) || stage8.2.residual_group.blocks.1.attn.proj.bias + | 0.609 | 0.206 | 0.818 | 0.109 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.weight + | -0.022 | -0.474 | 1.079 | 0.147 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.bias + | 0.000 | -0.760 | 0.819 | 0.143 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc11.weight + | -0.045 | -0.414 | 0.277 | 0.106 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.831 | 0.809 | 0.155 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc12.weight + | -0.002 | -0.544 | 0.244 | 0.082 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.749 | 0.962 | 0.151 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.1.mlp.fc2.weight + | 0.011 | -0.275 | 0.294 | 0.101 | torch.Size([180]) || stage8.2.residual_group.blocks.1.mlp.fc2.bias + | 0.990 | 0.168 | 1.270 | 0.152 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.weight + | -0.034 | -0.773 | 1.134 | 0.182 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.bias + | -0.070 | -2.190 | 5.577 | 0.255 | torch.Size([3375, 6]) || stage8.2.residual_group.blocks.2.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.2.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -1.004 | 1.113 | 0.152 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.2.attn.qkv_self.weight + | 0.000 | -0.781 | 0.551 | 0.137 | torch.Size([540]) || stage8.2.residual_group.blocks.2.attn.qkv_self.bias + | 0.001 | -0.580 | 0.572 | 0.141 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.2.attn.proj.weight + | -0.001 | -0.554 | 0.820 | 0.177 | torch.Size([180]) || stage8.2.residual_group.blocks.2.attn.proj.bias + | 0.642 | 0.178 | 0.852 | 0.111 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.weight + | -0.025 | -0.413 | 0.853 | 0.124 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.bias + | -0.000 | -0.780 | 1.141 | 0.143 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc11.weight + | -0.067 | -0.860 | 0.177 | 0.114 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -1.067 | 0.859 | 0.155 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc12.weight + | 0.002 | -0.298 
| 0.225 | 0.072 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.726 | 0.809 | 0.151 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.2.mlp.fc2.weight + | 0.001 | -0.394 | 0.292 | 0.112 | torch.Size([180]) || stage8.2.residual_group.blocks.2.mlp.fc2.bias + | 0.990 | 0.219 | 1.226 | 0.130 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.weight + | -0.032 | -0.837 | 1.156 | 0.168 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.bias + | -0.005 | -4.045 | 1.695 | 0.178 | torch.Size([3375, 6]) || stage8.2.residual_group.blocks.3.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.2.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.855 | 1.101 | 0.153 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.3.attn.qkv_self.weight + | -0.002 | -0.706 | 0.841 | 0.123 | torch.Size([540]) || stage8.2.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.586 | 0.699 | 0.134 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.3.attn.proj.weight + | 0.001 | -0.402 | 0.842 | 0.173 | torch.Size([180]) || stage8.2.residual_group.blocks.3.attn.proj.bias + | 0.613 | 0.196 | 0.800 | 0.102 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.weight + | -0.021 | -0.404 | 0.907 | 0.115 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.bias + | 0.000 | -0.718 | 0.654 | 0.138 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc11.weight + | -0.064 | -0.568 | 0.205 | 0.115 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc11.bias + | -0.001 | -0.674 | 0.596 | 0.155 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc12.weight + | -0.012 | -0.279 | 0.171 | 0.073 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.634 | 0.692 | 0.150 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.3.mlp.fc2.weight + | 0.010 | -0.528 | 1.331 | 0.175 | torch.Size([180]) || stage8.2.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.361 | 0.549 | 0.078 | torch.Size([180, 180]) || stage8.2.linear.weight + | -0.001 | -0.682 | 0.349 | 0.142 | torch.Size([180]) || stage8.2.linear.bias + | 1.018 | 0.177 | 1.365 | 0.177 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.weight + | -0.033 | -0.673 | 0.916 | 0.166 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.bias + | 0.003 | -2.963 | 1.620 | 0.138 | torch.Size([3375, 6]) || stage8.3.residual_group.blocks.0.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.3.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -1.095 | 0.939 | 0.152 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.0.attn.qkv_self.weight + | 0.004 | -0.725 | 0.682 | 0.135 | torch.Size([540]) || stage8.3.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.731 | 0.755 | 0.149 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.0.attn.proj.weight + | 0.013 | -0.457 | 0.481 | 0.158 | torch.Size([180]) || stage8.3.residual_group.blocks.0.attn.proj.bias + | 0.703 | 0.276 | 0.865 | 0.096 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.weight + | -0.024 | -0.449 | 0.966 | 0.132 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.bias + | -0.001 | -0.873 | 0.665 | 0.138 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc11.weight + | -0.052 | -0.479 | 0.198 | 0.104 | torch.Size([360]) || 
stage8.3.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.787 | 0.699 | 0.155 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc12.weight + | -0.003 | -0.436 | 0.264 | 0.081 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.675 | 0.689 | 0.153 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.0.mlp.fc2.weight + | 0.004 | -0.265 | 0.254 | 0.106 | torch.Size([180]) || stage8.3.residual_group.blocks.0.mlp.fc2.bias + | 0.956 | 0.184 | 1.255 | 0.167 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.weight + | -0.036 | -0.699 | 0.965 | 0.155 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.bias + | -0.038 | -3.913 | 4.625 | 0.210 | torch.Size([3375, 6]) || stage8.3.residual_group.blocks.1.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.3.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -1.142 | 0.934 | 0.147 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.1.attn.qkv_self.weight + | 0.000 | -0.708 | 0.560 | 0.117 | torch.Size([540]) || stage8.3.residual_group.blocks.1.attn.qkv_self.bias + | -0.002 | -0.746 | 0.626 | 0.149 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.1.attn.proj.weight + | 0.021 | -0.378 | 0.376 | 0.127 | torch.Size([180]) || stage8.3.residual_group.blocks.1.attn.proj.bias + | 0.741 | 0.282 | 0.933 | 0.107 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.weight + | -0.028 | -0.425 | 0.898 | 0.115 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.bias + | -0.001 | -0.761 | 0.822 | 0.139 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc11.weight + | -0.057 | -0.502 | 0.219 | 0.100 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.829 | 0.872 | 0.156 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc12.weight + | 0.004 | -0.262 | 0.226 | 0.077 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc12.bias + | -0.001 | -0.797 | 0.765 | 0.153 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.1.mlp.fc2.weight + | -0.002 | -0.360 | 0.289 | 0.109 | torch.Size([180]) || stage8.3.residual_group.blocks.1.mlp.fc2.bias + | 1.068 | 0.207 | 1.335 | 0.160 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.weight + | -0.034 | -0.784 | 1.005 | 0.163 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.bias + | -0.004 | -2.897 | 1.185 | 0.143 | torch.Size([3375, 6]) || stage8.3.residual_group.blocks.2.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.3.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -1.055 | 0.899 | 0.151 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.2.attn.qkv_self.weight + | -0.000 | -0.572 | 0.670 | 0.120 | torch.Size([540]) || stage8.3.residual_group.blocks.2.attn.qkv_self.bias + | -0.001 | -0.729 | 0.798 | 0.156 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.2.attn.proj.weight + | 0.025 | -0.570 | 0.501 | 0.166 | torch.Size([180]) || stage8.3.residual_group.blocks.2.attn.proj.bias + | 0.759 | 0.228 | 0.969 | 0.115 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.weight + | -0.025 | -0.394 | 0.791 | 0.103 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.bias + | -0.001 | -0.962 | 0.903 | 0.137 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc11.weight + | -0.064 | -0.587 | 0.209 | 0.108 | torch.Size([360]) || 
stage8.3.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.966 | 0.925 | 0.156 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc12.weight + | 0.004 | -0.366 | 0.239 | 0.074 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.782 | 0.817 | 0.152 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.2.mlp.fc2.weight + | 0.003 | -0.321 | 0.340 | 0.117 | torch.Size([180]) || stage8.3.residual_group.blocks.2.mlp.fc2.bias + | 1.082 | 0.237 | 1.309 | 0.144 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.weight + | -0.031 | -0.726 | 0.933 | 0.149 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.bias + | 0.005 | -3.023 | 1.093 | 0.142 | torch.Size([3375, 6]) || stage8.3.residual_group.blocks.3.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.3.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.830 | 0.867 | 0.151 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.3.attn.qkv_self.weight + | -0.001 | -0.487 | 0.710 | 0.107 | torch.Size([540]) || stage8.3.residual_group.blocks.3.attn.qkv_self.bias + | -0.001 | -0.940 | 0.725 | 0.157 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.3.attn.proj.weight + | 0.027 | -0.522 | 0.807 | 0.170 | torch.Size([180]) || stage8.3.residual_group.blocks.3.attn.proj.bias + | 0.705 | 0.249 | 0.868 | 0.095 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.weight + | -0.023 | -0.426 | 0.826 | 0.108 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.bias + | -0.000 | -0.814 | 0.927 | 0.131 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc11.weight + | -0.043 | -0.613 | 0.209 | 0.116 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.709 | 0.851 | 0.154 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc12.weight + | -0.004 | -0.225 | 0.241 | 0.078 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.857 | 0.845 | 0.151 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.3.mlp.fc2.weight + | 0.016 | -0.441 | 1.206 | 0.183 | torch.Size([180]) || stage8.3.residual_group.blocks.3.mlp.fc2.bias + | -0.002 | -0.437 | 0.634 | 0.077 | torch.Size([180, 180]) || stage8.3.linear.weight + | -0.003 | -0.564 | 0.338 | 0.145 | torch.Size([180]) || stage8.3.linear.bias + | 1.164 | 0.238 | 1.496 | 0.205 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.weight + | -0.033 | -0.667 | 0.780 | 0.170 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.bias + | -0.002 | -3.025 | 1.339 | 0.130 | torch.Size([3375, 6]) || stage8.4.residual_group.blocks.0.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.4.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.736 | 0.735 | 0.147 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.0.attn.qkv_self.weight + | -0.007 | -0.468 | 0.575 | 0.112 | torch.Size([540]) || stage8.4.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.725 | 0.750 | 0.162 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.0.attn.proj.weight + | -0.004 | -0.461 | 0.540 | 0.163 | torch.Size([180]) || stage8.4.residual_group.blocks.0.attn.proj.bias + | 0.804 | 0.361 | 0.962 | 0.091 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.weight + | -0.025 | -0.421 | 0.837 | 0.127 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.bias + | 
-0.002 | -0.664 | 0.869 | 0.129 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc11.weight + | -0.028 | -0.519 | 0.180 | 0.098 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.793 | 0.821 | 0.156 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc12.weight + | 0.001 | -0.235 | 0.329 | 0.081 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.758 | 0.730 | 0.153 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.0.mlp.fc2.weight + | 0.010 | -0.332 | 0.306 | 0.118 | torch.Size([180]) || stage8.4.residual_group.blocks.0.mlp.fc2.bias + | 1.097 | 0.202 | 1.361 | 0.200 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.weight + | -0.034 | -0.597 | 0.687 | 0.147 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.bias + | 0.007 | -4.645 | 1.140 | 0.130 | torch.Size([3375, 6]) || stage8.4.residual_group.blocks.1.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.4.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -1.002 | 0.810 | 0.144 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.1.attn.qkv_self.weight + | 0.005 | -0.407 | 0.438 | 0.108 | torch.Size([540]) || stage8.4.residual_group.blocks.1.attn.qkv_self.bias + | -0.001 | -0.646 | 0.678 | 0.154 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.1.attn.proj.weight + | 0.004 | -0.418 | 0.415 | 0.139 | torch.Size([180]) || stage8.4.residual_group.blocks.1.attn.proj.bias + | 0.836 | 0.316 | 1.026 | 0.106 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.weight + | -0.024 | -0.364 | 0.851 | 0.117 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.bias + | -0.002 | -0.690 | 0.848 | 0.128 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc11.weight + | -0.032 | -0.484 | 0.195 | 0.101 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.863 | 0.768 | 0.155 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc12.weight + | -0.001 | -0.319 | 0.409 | 0.078 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.836 | 0.822 | 0.154 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.1.mlp.fc2.weight + | 0.019 | -0.356 | 0.374 | 0.129 | torch.Size([180]) || stage8.4.residual_group.blocks.1.mlp.fc2.bias + | 1.151 | 0.229 | 1.393 | 0.176 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.weight + | -0.028 | -0.649 | 0.925 | 0.149 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.bias + | -0.005 | -3.864 | 1.138 | 0.140 | torch.Size([3375, 6]) || stage8.4.residual_group.blocks.2.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.4.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -1.813 | 0.897 | 0.146 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.2.attn.qkv_self.weight + | -0.001 | -0.449 | 0.486 | 0.103 | torch.Size([540]) || stage8.4.residual_group.blocks.2.attn.qkv_self.bias + | -0.001 | -0.739 | 0.710 | 0.175 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.2.attn.proj.weight + | -0.000 | -0.542 | 0.407 | 0.162 | torch.Size([180]) || stage8.4.residual_group.blocks.2.attn.proj.bias + | 0.820 | 0.329 | 0.989 | 0.094 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.weight + | -0.025 | -0.461 | 0.753 | 0.106 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.bias + | 
-0.001 | -0.648 | 0.788 | 0.125 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc11.weight + | -0.015 | -0.501 | 0.248 | 0.101 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.745 | 0.796 | 0.155 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc12.weight + | 0.007 | -0.244 | 0.231 | 0.080 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.771 | 1.049 | 0.154 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.2.mlp.fc2.weight + | 0.018 | -0.360 | 0.336 | 0.143 | torch.Size([180]) || stage8.4.residual_group.blocks.2.mlp.fc2.bias + | 1.177 | 0.269 | 1.385 | 0.163 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.weight + | -0.028 | -0.700 | 0.877 | 0.145 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.bias + | -0.005 | -2.684 | 0.830 | 0.097 | torch.Size([3375, 6]) || stage8.4.residual_group.blocks.3.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.4.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.996 | 0.727 | 0.142 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.3.attn.qkv_self.weight + | 0.004 | -0.326 | 0.449 | 0.101 | torch.Size([540]) || stage8.4.residual_group.blocks.3.attn.qkv_self.bias + | -0.001 | -0.777 | 0.785 | 0.170 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.3.attn.proj.weight + | 0.004 | -0.396 | 0.449 | 0.158 | torch.Size([180]) || stage8.4.residual_group.blocks.3.attn.proj.bias + | 0.790 | 0.392 | 1.005 | 0.078 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.weight + | -0.030 | -0.481 | 0.719 | 0.110 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.bias + | -0.001 | -0.569 | 0.732 | 0.121 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc11.weight + | 0.020 | -0.670 | 0.335 | 0.125 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.822 | 0.831 | 0.155 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc12.weight + | -0.003 | -0.282 | 0.296 | 0.089 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.856 | 0.886 | 0.155 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.3.mlp.fc2.weight + | 0.029 | -0.390 | 0.437 | 0.161 | torch.Size([180]) || stage8.4.residual_group.blocks.3.mlp.fc2.bias + | -0.002 | -0.490 | 0.625 | 0.079 | torch.Size([180, 180]) || stage8.4.linear.weight + | -0.002 | -0.573 | 0.398 | 0.168 | torch.Size([180]) || stage8.4.linear.bias + | 1.337 | 0.163 | 1.694 | 0.268 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.weight + | -0.025 | -0.727 | 1.008 | 0.186 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.bias + | -0.738 | -2.885 | 5.812 | 0.748 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.852 | 0.854 | 0.135 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.0.attn.qkv_self.weight + | -0.005 | -0.546 | 0.550 | 0.112 | torch.Size([540]) || stage8.5.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.901 | 0.781 | 0.195 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.0.attn.proj.weight + | -0.020 | -0.545 | 0.469 | 0.173 | torch.Size([180]) || stage8.5.residual_group.blocks.0.attn.proj.bias + | 0.956 | 0.367 | 1.185 | 0.129 | torch.Size([180]) 
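The fc11/fc12/fc2 shapes repeated above ([360, 180], [360, 180], [180, 360]) pin down the Mlp_GEGLU pattern: a gated MLP with 2x hidden width in which a GELU-activated branch is multiplied elementwise by a linear gate. A sketch consistent with those shapes:

import torch.nn as nn

class MlpGEGLU(nn.Module):
    def __init__(self, dim=180, hidden=360, drop=0.0):
        super().__init__()
        self.fc11 = nn.Linear(dim, hidden)  # value branch, GELU-activated
        self.fc12 = nn.Linear(dim, hidden)  # gate branch
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden, dim)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        return self.drop(self.fc2(self.act(self.fc11(x)) * self.fc12(x)))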
|| stage8.5.residual_group.blocks.0.norm2.weight + | -0.033 | -0.519 | 0.833 | 0.147 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.bias + | -0.001 | -0.832 | 0.580 | 0.119 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc11.weight + | 0.055 | -0.256 | 0.378 | 0.097 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -1.058 | 0.859 | 0.154 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc12.weight + | 0.006 | -0.377 | 0.318 | 0.093 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc12.bias + | -0.001 | -0.751 | 0.766 | 0.156 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.0.mlp.fc2.weight + | -0.011 | -0.316 | 0.323 | 0.132 | torch.Size([180]) || stage8.5.residual_group.blocks.0.mlp.fc2.bias + | 1.346 | 0.151 | 1.746 | 0.272 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.weight + | -0.023 | -0.691 | 0.993 | 0.169 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.bias + | -0.705 | -2.997 | 4.745 | 0.748 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.911 | 0.984 | 0.141 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.1.attn.qkv_self.weight + | -0.011 | -0.405 | 0.288 | 0.095 | torch.Size([540]) || stage8.5.residual_group.blocks.1.attn.qkv_self.bias + | 0.001 | -0.853 | 0.977 | 0.210 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.1.attn.proj.weight + | -0.008 | -0.516 | 0.596 | 0.170 | torch.Size([180]) || stage8.5.residual_group.blocks.1.attn.proj.bias + | 1.021 | 0.333 | 1.268 | 0.154 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.weight + | -0.034 | -0.512 | 0.812 | 0.134 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.bias + | 0.000 | -0.561 | 0.546 | 0.120 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc11.weight + | 0.050 | -0.450 | 0.320 | 0.100 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc11.bias + | 0.001 | -0.907 | 0.752 | 0.157 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc12.weight + | -0.008 | -0.306 | 0.343 | 0.091 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc12.bias + | -0.001 | -0.891 | 0.741 | 0.158 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.1.mlp.fc2.weight + | -0.014 | -0.407 | 0.478 | 0.168 | torch.Size([180]) || stage8.5.residual_group.blocks.1.mlp.fc2.bias + | 1.266 | 0.195 | 1.640 | 0.251 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.weight + | -0.028 | -0.680 | 0.987 | 0.162 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.bias + | -0.515 | -2.839 | 4.668 | 0.636 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.2.attn.relative_position_index + | 0.001 | -0.968 | 0.890 | 0.144 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.2.attn.qkv_self.weight + | -0.001 | -0.372 | 0.390 | 0.095 | torch.Size([540]) || stage8.5.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -1.001 | 0.995 | 0.221 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.2.attn.proj.weight + | -0.012 | -0.576 | 0.456 | 0.172 | torch.Size([180]) || stage8.5.residual_group.blocks.2.attn.proj.bias + | 1.046 | 0.311 | 1.264 | 0.147 | torch.Size([180]) || 
stage8.5.residual_group.blocks.2.norm2.weight + | -0.033 | -0.519 | 0.785 | 0.123 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.bias + | 0.000 | -0.533 | 0.563 | 0.119 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc11.weight + | 0.053 | -0.314 | 0.364 | 0.109 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.862 | 0.822 | 0.158 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc12.weight + | -0.004 | -0.266 | 0.289 | 0.084 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc12.bias + | 0.001 | -0.787 | 0.886 | 0.161 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.2.mlp.fc2.weight + | -0.007 | -0.421 | 0.503 | 0.171 | torch.Size([180]) || stage8.5.residual_group.blocks.2.mlp.fc2.bias + | 1.226 | 0.277 | 1.561 | 0.208 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.weight + | -0.032 | -0.670 | 1.030 | 0.168 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.bias + | -0.401 | -1.953 | 3.930 | 0.598 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.857 | 0.754 | 0.139 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.3.attn.qkv_self.weight + | 0.004 | -0.317 | 0.278 | 0.081 | torch.Size([540]) || stage8.5.residual_group.blocks.3.attn.qkv_self.bias + | -0.002 | -1.022 | 0.999 | 0.200 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.3.attn.proj.weight + | -0.009 | -0.384 | 0.393 | 0.165 | torch.Size([180]) || stage8.5.residual_group.blocks.3.attn.proj.bias + | 1.038 | 0.340 | 1.216 | 0.128 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.weight + | -0.034 | -0.574 | 0.775 | 0.124 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.bias + | 0.001 | -0.588 | 0.613 | 0.119 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc11.weight + | 0.063 | -0.447 | 0.307 | 0.111 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.873 | 0.775 | 0.159 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc12.weight + | 0.001 | -0.456 | 0.435 | 0.092 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.819 | 0.772 | 0.160 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.3.mlp.fc2.weight + | -0.018 | -0.319 | 0.340 | 0.131 | torch.Size([180]) || stage8.5.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.562 | 0.471 | 0.080 | torch.Size([180, 180]) || stage8.5.linear.weight + | 0.024 | -0.609 | 0.488 | 0.184 | torch.Size([180]) || stage8.5.linear.bias + | 1.369 | 0.171 | 1.961 | 0.355 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.weight + | -0.028 | -0.642 | 0.733 | 0.196 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.bias + | -0.029 | -1.759 | 1.624 | 0.312 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.686 | 0.691 | 0.113 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.0.attn.qkv_self.weight + | -0.003 | -0.261 | 0.301 | 0.081 | torch.Size([540]) || stage8.6.residual_group.blocks.0.attn.qkv_self.bias + | 0.001 | -0.736 | 0.637 | 0.149 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.0.attn.proj.weight + | -0.006 | 
-0.293 | 0.300 | 0.106 | torch.Size([180]) || stage8.6.residual_group.blocks.0.attn.proj.bias + | 1.302 | 0.401 | 1.613 | 0.192 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.weight + | -0.029 | -0.475 | 0.696 | 0.159 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.bias + | -0.001 | -0.649 | 0.564 | 0.119 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc11.weight + | 0.036 | -0.275 | 0.218 | 0.071 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.717 | 0.831 | 0.148 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc12.weight + | 0.006 | -0.231 | 0.270 | 0.074 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.833 | 0.791 | 0.150 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.0.mlp.fc2.weight + | 0.004 | -0.364 | 0.324 | 0.134 | torch.Size([180]) || stage8.6.residual_group.blocks.0.mlp.fc2.bias + | 1.450 | 0.218 | 1.962 | 0.354 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.weight + | -0.025 | -0.716 | 0.851 | 0.206 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.bias + | -0.045 | -1.549 | 2.100 | 0.321 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.759 | 0.636 | 0.110 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.1.attn.qkv_self.weight + | -0.001 | -0.235 | 0.269 | 0.070 | torch.Size([540]) || stage8.6.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.691 | 0.657 | 0.145 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.1.attn.proj.weight + | -0.007 | -0.375 | 0.328 | 0.116 | torch.Size([180]) || stage8.6.residual_group.blocks.1.attn.proj.bias + | 1.326 | 0.335 | 1.596 | 0.186 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.weight + | -0.029 | -0.566 | 0.748 | 0.160 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.bias + | -0.002 | -0.667 | 0.591 | 0.121 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc11.weight + | 0.042 | -0.387 | 0.373 | 0.078 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.685 | 0.894 | 0.147 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc12.weight + | 0.000 | -0.353 | 0.326 | 0.092 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.801 | 0.692 | 0.149 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.1.mlp.fc2.weight + | -0.007 | -0.331 | 0.273 | 0.127 | torch.Size([180]) || stage8.6.residual_group.blocks.1.mlp.fc2.bias + | 1.416 | 0.215 | 1.819 | 0.303 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.weight + | -0.024 | -0.596 | 0.869 | 0.211 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.bias + | -0.038 | -2.355 | 1.330 | 0.286 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.964 | 0.732 | 0.112 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.2.attn.qkv_self.weight + | 0.002 | -0.192 | 0.251 | 0.052 | torch.Size([540]) || stage8.6.residual_group.blocks.2.attn.qkv_self.bias + | 0.001 | -0.736 | 0.624 | 0.138 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.2.attn.proj.weight + | -0.008 | -0.376 | 0.254 | 
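Likewise, every qkv_self.weight of shape [540, 180] above is a fused q/k/v projection: 3 * 180 = 540 outputs, split across the 6 heads of these stages at 30 channels each.

dim, num_heads = 180, 6
qkv_out = 3 * dim            # 540: q, k, v fused in one linear layer
head_dim = dim // num_heads  # 30 channels per head
assert (qkv_out, head_dim) == (540, 30)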
0.119 | torch.Size([180]) || stage8.6.residual_group.blocks.2.attn.proj.bias + | 1.352 | 0.217 | 1.546 | 0.187 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.weight + | -0.023 | -0.627 | 0.881 | 0.164 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.bias + | -0.001 | -0.616 | 0.688 | 0.122 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc11.weight + | 0.040 | -0.332 | 0.242 | 0.083 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.970 | 0.669 | 0.148 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc12.weight + | 0.006 | -0.333 | 0.371 | 0.092 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.849 | 0.824 | 0.150 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.2.mlp.fc2.weight + | -0.007 | -0.282 | 0.333 | 0.111 | torch.Size([180]) || stage8.6.residual_group.blocks.2.mlp.fc2.bias + | 1.346 | 0.206 | 1.798 | 0.286 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.weight + | -0.022 | -0.742 | 0.797 | 0.196 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.bias + | -0.056 | -1.296 | 2.098 | 0.311 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.693 | 0.597 | 0.103 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.3.attn.qkv_self.weight + | -0.003 | -0.211 | 0.161 | 0.055 | torch.Size([540]) || stage8.6.residual_group.blocks.3.attn.qkv_self.bias + | -0.000 | -0.767 | 0.663 | 0.127 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.3.attn.proj.weight + | -0.011 | -0.269 | 0.169 | 0.072 | torch.Size([180]) || stage8.6.residual_group.blocks.3.attn.proj.bias + | 1.329 | 0.247 | 1.544 | 0.183 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.weight + | -0.023 | -0.619 | 0.881 | 0.171 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.bias + | -0.001 | -0.670 | 0.594 | 0.124 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc11.weight + | 0.052 | -0.262 | 0.275 | 0.073 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.899 | 0.808 | 0.149 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc12.weight + | -0.009 | -0.273 | 0.326 | 0.090 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc12.bias + | 0.001 | -0.773 | 0.930 | 0.150 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.3.mlp.fc2.weight + | -0.001 | -0.264 | 0.261 | 0.088 | torch.Size([180]) || stage8.6.residual_group.blocks.3.mlp.fc2.bias + | -0.001 | -1.128 | 1.483 | 0.100 | torch.Size([180, 180]) || stage8.6.linear.weight + | 0.014 | -0.757 | 0.769 | 0.160 | torch.Size([180]) || stage8.6.linear.bias + | 0.387 | 0.109 | 1.033 | 0.194 | torch.Size([180]) || norm.weight + | -0.006 | -0.754 | 0.773 | 0.142 | torch.Size([180]) || norm.bias + | 0.001 | -0.596 | 0.563 | 0.121 | torch.Size([120, 180]) || conv_after_body.weight + | -0.016 | -0.251 | 0.121 | 0.061 | torch.Size([120]) || conv_after_body.bias + | 0.003 | -1.347 | 1.476 | 0.161 | torch.Size([64, 120, 1, 3, 3]) || conv_before_upsample.0.weight + | -0.090 | -0.847 | 0.182 | 0.193 | torch.Size([64]) || conv_before_upsample.0.bias + | 0.002 | -1.602 | 0.994 | 0.114 | torch.Size([256, 64, 1, 3, 3]) || upsample.0.weight + | -0.059 | -0.461 | 0.137 | 0.098 | torch.Size([256]) || upsample.0.bias + | -0.005 | -4.099 | 0.822 | 
0.076 | torch.Size([256, 64, 1, 3, 3]) || upsample.5.weight + | -0.137 | -0.426 | 0.152 | 0.097 | torch.Size([256]) || upsample.5.bias + | -0.000 | -0.377 | 0.324 | 0.014 | torch.Size([64, 64, 1, 3, 3]) || upsample.10.weight + | -0.000 | -0.016 | 0.014 | 0.003 | torch.Size([64]) || upsample.10.bias + | -0.000 | -0.043 | 0.040 | 0.004 | torch.Size([3, 64, 1, 3, 3]) || conv_last.weight + | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([3]) || conv_last.bias + +22-03-11 10:10:58.452 : task: 003_train_vrt_videosr_bi_vimeo_7frames + model: vrt + gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7] + dist: False + find_unused_parameters: False + use_static_graph: True + scale: 4 + n_channels: 3 + path:[ + root: experiments + pretrained_netG: model_zoo/vrt/002_VRT_videosr_bi_REDS_16frames.pth + pretrained_netE: None + task: experiments/003_train_vrt_videosr_bi_vimeo_7frames + log: experiments/003_train_vrt_videosr_bi_vimeo_7frames + options: experiments/003_train_vrt_videosr_bi_vimeo_7frames/options + models: experiments/003_train_vrt_videosr_bi_vimeo_7frames/models + images: experiments/003_train_vrt_videosr_bi_vimeo_7frames/images + pretrained_optimizerG: None + ] + datasets:[ + train:[ + name: train_dataset + dataset_type: VideoRecurrentTrainVimeoDataset + dataroot_gt: trainsets/vimeo90k + dataroot_lq: trainsets/vimeo90k + meta_info_file: data/meta_info/meta_info_Vimeo90K_train_GT.txt + io_backend:[ + type: disk + ] + num_frame: -1 + gt_size: 256 + interval_list: [1] + random_reverse: True + use_hflip: True + use_rot: True + pad_sequence: True + dataloader_shuffle: True + dataloader_num_workers: 32 + dataloader_batch_size: 8 + phase: train + scale: 4 + n_channels: 3 + ] + test:[ + name: test_dataset + dataset_type: VideoRecurrentTestDataset + dataroot_gt: testsets/Vid4/GT + dataroot_lq: testsets/Vid4/BIx4 + cache_data: True + io_backend:[ + type: disk + ] + num_frame: -1 + phase: test + scale: 4 + n_channels: 3 + ] + ] + netG:[ + net_type: vrt + upscale: 4 + img_size: [8, 64, 64] + window_size: [8, 8, 8] + depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4] + indep_reconsts: [11, 12] + embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180] + num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6] + spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth + pa_frames: 4 + deformable_groups: 16 + nonblind_denoising: False + use_checkpoint_attn: False + use_checkpoint_ffn: False + no_checkpoint_attn_blocks: [] + no_checkpoint_ffn_blocks: [] + init_type: default + scale: 4 + ] + train:[ + G_lossfn_type: charbonnier + G_lossfn_weight: 1.0 + G_charbonnier_eps: 1e-09 + E_decay: 0 + G_optimizer_type: adam + G_optimizer_lr: 0.0004 + G_optimizer_betas: [0.9, 0.99] + G_optimizer_wd: 0 + G_optimizer_clipgrad: None + G_optimizer_reuse: True + fix_iter: 20000 + fix_lr_mul: 0.125 + fix_keys: ['spynet', 'deform'] + total_iter: 300000 + G_scheduler_type: CosineAnnealingWarmRestarts + G_scheduler_periods: 300000 + G_scheduler_eta_min: 1e-07 + G_regularizer_orthstep: None + G_regularizer_clipstep: None + G_param_strict: False + E_param_strict: True + checkpoint_test: 5000 + checkpoint_save: 5000 + checkpoint_print: 200 + F_feature_layer: 34 + F_weights: 1.0 + F_lossfn_type: l1 + F_use_input_norm: True + F_use_range_norm: False + G_scheduler_restart_weights: 1 + ] + val:[ + save_img: False + pad_seq: False + flip_seq: False + center_frame_only: False + num_frame_testing: 32 + num_frame_overlapping: 2 + size_patch_testing: 128 + ] + opt_path: options/vrt/003_train_vrt_videosr_bi_vimeo_7frames.json + is_train: True + 
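The tail of the parameter table fixes the x4 reconstruction head: conv_before_upsample maps 120 to 64 channels, each upsample conv emits 256 = 64 * 2**2 channels for one pixel-shuffle step, and conv_last maps 64 to 3. Below is a sketch assuming the usual conv, PixelShuffle(2), conv, PixelShuffle(2) ordering; the activation choice and exact module order are assumptions, only the channel counts come from the log.

import torch
import torch.nn as nn

class ReconHead(nn.Module):
    """Sketch of the x4 tail; 1x3x3 kernels act per-frame on (n, c, d, h, w)."""
    def __init__(self):
        super().__init__()
        self.pre = nn.Conv3d(120, 64, (1, 3, 3), padding=(0, 1, 1))  # conv_before_upsample
        self.up1 = nn.Conv3d(64, 256, (1, 3, 3), padding=(0, 1, 1))  # upsample.0
        self.up2 = nn.Conv3d(64, 256, (1, 3, 3), padding=(0, 1, 1))  # upsample.5
        self.post = nn.Conv3d(64, 64, (1, 3, 3), padding=(0, 1, 1))  # upsample.10
        self.last = nn.Conv3d(64, 3, (1, 3, 3), padding=(0, 1, 1))   # conv_last
        self.shuffle = nn.PixelShuffle(2)

    def _shuffle(self, x):
        # 2D pixel shuffle applied frame by frame: (n, c, d, h, w) -> (n, c/4, d, 2h, 2w).
        n, c, d, h, w = x.shape
        x = self.shuffle(x.transpose(1, 2).reshape(n * d, c, h, w))
        return x.reshape(n, d, c // 4, h * 2, w * 2).transpose(1, 2)

    def forward(self, x):
        x = torch.relu(self.pre(x))     # activation is a placeholder assumption
        x = self._shuffle(self.up1(x))  # 256 -> 64 channels, H and W doubled
        x = self._shuffle(self.up2(x))  # x4 total
        return self.last(self.post(x))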
merge_bn: False + merge_bn_startpoint: -1 + num_gpu: 8 + rank: 0 + world_size: 1 + +22-03-11 10:10:58.485 : Number of train images: 64,612, iters: 8,077 +22-03-11 10:11:02.029 : +Networks name: VRT +Params number: 32577991 +Net structure: +VRT( + (conv_first): Conv3d(27, 120, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (spynet): SpyNet( + (basic_module): ModuleList( + (0): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (1): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (2): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (3): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (4): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (5): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + ) + ) + (stage1): Stage( + (reshape): Sequential( + (0): Rearrange('n c d h w -> n d h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), 
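The SpyNet block repeated six times above is the standard coarse-to-fine optical-flow module. Its 8 input channels are, per the SpyNet design, the reference frame (3), the supporting frame warped by the current flow estimate (3), and that flow (2); the 2 output channels are the flow refinement for one pyramid level. One level, exactly as printed:

import torch.nn as nn

def spynet_basic_module():
    # Conv2d(8, 32) -> ... -> Conv2d(16, 2), all 7x7 with padding 3, as in the log.
    return nn.Sequential(
        nn.Conv2d(8, 32, 7, padding=3), nn.ReLU(),
        nn.Conv2d(32, 64, 7, padding=3), nn.ReLU(),
        nn.Conv2d(64, 32, 7, padding=3), nn.ReLU(),
        nn.Conv2d(32, 16, 7, padding=3), nn.ReLU(),
        nn.Conv2d(16, 2, 7, padding=3),
    )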
eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + 
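Block 0 prints drop_path as Identity() while later blocks print DropPath(); that is the signature of a stochastic-depth schedule growing linearly with block index, since a rate of 0 is materialized as Identity. A plausible schedule (the 0.2 endpoint is illustrative, not read from this log):

import torch

depths = [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4]  # from the config above
dpr = [x.item() for x in torch.linspace(0, 0.2, steps=sum(depths))]
assert dpr[0] == 0.0  # first block: drop_path becomes Identity in the printout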
(qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(364, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 432, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage2): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): 
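The pa_deform channel counts can be decoded from the config (embed_dim 120, pa_frames 4, deformable_groups 16): the 364 input channels are consistent with the current feature plus two flow-warped neighbor features and their two 2-channel flows, and the 432 outputs are offsets and masks for a 3x3 deformable kernel. A quick check of that arithmetic:

embed_dim, pa_frames, groups = 120, 4, 16
in_ch = (1 + pa_frames // 2) * embed_dim + (pa_frames // 2) * 2  # 364: conv_offset input
out_ch = 3 * 9 * groups  # (2 offsets + 1 mask) * 3x3 kernel * 16 groups = 432
assert (in_ch, out_ch) == (364, 432)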
DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, 
elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(364, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 432, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage3): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, 
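Each downsampling stage (stage2 through stage4) opens with the Rearrange printed above: a 2x2 spatial neighborhood is folded into channels (4 * 120 = 480) and Linear(480, 120) projects back, so H and W halve while the embedding width stays 120. The same fold in einops, with a, b standing in for neih, neiw:

import torch
from einops import rearrange

x = torch.randn(1, 120, 7, 64, 64)  # n c d h w
x = rearrange(x, 'n c d (h a) (w b) -> n d h w (b a c)', a=2, b=2)
assert x.shape == (1, 7, 32, 32, 480)  # then Linear(480, 120) restores width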
bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + 
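Every residual group above stacks TMSA blocks with one skeleton: pre-norm windowed attention with a residual connection, then a pre-norm GEGLU MLP with a second residual. residual_group1 holds six blocks whose attention adds the qkv_mut projection (mutual attention across frame pairs), residual_group2 holds two self-attention-only blocks, and each group is followed by a Linear. Omitting the window partitioning and attention masks, the block reduces to this sketch:

import torch.nn as nn

class TMSABlock(nn.Module):
    # Simplified: `attn` and `mlp` stand for the printed WindowAttention / Mlp_GEGLU.
    def __init__(self, dim, attn, mlp, drop_path=nn.Identity()):
        super().__init__()
        self.norm1, self.attn = nn.LayerNorm(dim), attn
        self.norm2, self.mlp = nn.LayerNorm(dim), mlp
        self.drop_path = drop_path

    def forward(self, x):
        x = x + self.drop_path(self.attn(self.norm1(x)))    # attention branch
        return x + self.drop_path(self.mlp(self.norm2(x)))  # GEGLU MLP branch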
(linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(364, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 432, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage4): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): 
Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): 
Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(364, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 432, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage5): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, 
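Stages 5 and 6 invert the 2x2 fold on the way back up: the Rearrange printed above spreads channels into space (120 / 4 = 30 channels per position) before Linear(30, 120) restores the embedding width, doubling H and W per stage:

import torch
from einops import rearrange

x = torch.randn(1, 120, 7, 16, 16)  # n (b a c) d h w
x = rearrange(x, 'n (b a c) d h w -> n d (h a) (w b) c', a=2, b=2)
assert x.shape == (1, 7, 32, 32, 30)  # then Linear(30, 120) restores width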
out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, 
out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(364, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 432, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage6): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, 
out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, 
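
In the residual_group1 blocks, proj takes in_features=240 even though qkv_self and qkv_mut each map 120 to 360. The shapes suggest that a self-attention output and a mutual-attention output (120 channels each) are concatenated before the output projection. A hypothetical sketch under that assumption, not the repository implementation (VRT's mutual branch exchanges queries and keys between the two frames of a temporal window; plain attention is used below only to keep the sketch short):

import torch
import torch.nn as nn

class TwoBranchWindowAttention(nn.Module):
    # Illustrative only: concatenating the two 120-channel attention
    # outputs is what gives proj its 240 input features.
    def __init__(self, dim=120, heads=6):
        super().__init__()
        self.heads = heads
        self.qkv_self = nn.Linear(dim, dim * 3)
        self.qkv_mut = nn.Linear(dim, dim * 3)
        self.proj = nn.Linear(dim * 2, dim)
        self.softmax = nn.Softmax(dim=-1)

    def attend(self, q, k, v):
        b, n, c = q.shape
        split = lambda t: t.reshape(b, n, self.heads, c // self.heads).transpose(1, 2)
        q, k, v = split(q), split(k), split(v)
        attn = self.softmax(q @ k.transpose(-2, -1) * (c // self.heads) ** -0.5)
        return (attn @ v).transpose(1, 2).reshape(b, n, c)

    def forward(self, x):
        # x: (windows*batch, tokens, dim), tokens spanning two frames
        out_self = self.attend(*self.qkv_self(x).chunk(3, dim=-1))
        out_mut = self.attend(*self.qkv_mut(x).chunk(3, dim=-1))
        return self.proj(torch.cat([out_self, out_mut], dim=-1))

print(TwoBranchWindowAttention()(torch.randn(4, 128, 120)).shape)  # torch.Size([4, 128, 120])
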
out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(364, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 432, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage7): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + 
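
Each of these later stages opens with a reshape Sequential built on einops Rearrange layers: the 120 input channels are unpacked as 2 x 2 x 30, doubling the spatial size, and the remaining 30 channels are normalised by the LayerNorm((30,)) and projected back to 120 by the following Linear. A quick shape check of the printed pattern:

import torch
from einops import rearrange

# Toy tensor: batch 1, 120 channels, 7 frames, 16x16 features.
x = torch.randn(1, 120, 7, 16, 16)

# The printed Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c',
# neih=2, neiw=2) trades 120 = 2*2*30 channels for a 2x spatial upscale.
y = rearrange(x, 'n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2)
print(y.shape)  # torch.Size([1, 7, 32, 32, 30])
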
(fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): 
Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(364, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 432, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage8): ModuleList( + (0): Sequential( + (0): Rearrange('n c d h w -> n d h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=120, out_features=180, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (1): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): 
DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (2): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (3): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): 
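
From stage8 onward the network switches to RTMSA modules: a Sequential first lifts the features from 120 to 180 channels, after which each RTMSA wraps a self-attention-only TMSA group plus a Linear in a residual connection. A sketch of that wrapper, with the attention blocks themselves left abstract:

import torch
import torch.nn as nn

class RTMSASketch(nn.Module):
    # Residual wrapper matching the printed layout: TMSA group -> Linear,
    # added back to the input. Block internals are elided.
    def __init__(self, dim=180, residual_group=None):
        super().__init__()
        self.residual_group = residual_group or nn.Identity()
        self.linear = nn.Linear(dim, dim)

    def forward(self, x):
        # x: (n, c, d, h, w); the group and the Linear act on channel-last tokens
        n, c, d, h, w = x.shape
        t = x.permute(0, 2, 3, 4, 1).reshape(n, d * h * w, c)
        t = self.linear(self.residual_group(t))
        return x + t.reshape(n, d, h, w, c).permute(0, 4, 1, 2, 3)

print(RTMSASketch()(torch.randn(1, 180, 2, 8, 8)).shape)  # torch.Size([1, 180, 2, 8, 8])
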
Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (4): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + 
(norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (5): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + 
(drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (6): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + ) + (norm): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (conv_after_body): Linear(in_features=180, out_features=120, bias=True) + (conv_before_upsample): Sequential( + (0): Conv3d(120, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): LeakyReLU(negative_slope=0.01, inplace=True) + ) + (upsample): Upsample( + (0): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): Transpose_Dim12() 
+ (2): PixelShuffle(upscale_factor=2) + (3): Transpose_Dim12() + (4): LeakyReLU(negative_slope=0.1, inplace=True) + (5): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (6): Transpose_Dim12() + (7): PixelShuffle(upscale_factor=2) + (8): Transpose_Dim12() + (9): LeakyReLU(negative_slope=0.1, inplace=True) + (10): Conv3d(64, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + ) + (conv_last): Conv3d(64, 3, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) +) + +22-03-11 10:11:02.191 : + | mean | min | max | std || shape + | 0.000 | -1.496 | 1.623 | 0.115 | torch.Size([120, 27, 1, 3, 3]) || conv_first.weight + | -0.005 | -1.075 | 0.916 | 0.274 | torch.Size([120]) || conv_first.bias + | 0.449 | 0.406 | 0.485 | 0.040 | torch.Size([1, 3, 1, 1]) || spynet.mean + | 0.226 | 0.224 | 0.229 | 0.003 | torch.Size([1, 3, 1, 1]) || spynet.std + | -0.000 | -0.656 | 0.699 | 0.067 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.0.basic_module.0.weight + | -0.037 | -0.877 | 0.359 | 0.346 | torch.Size([32]) || spynet.basic_module.0.basic_module.0.bias + | -0.007 | -3.201 | 0.948 | 0.097 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.0.basic_module.2.weight + | 0.063 | -1.264 | 0.752 | 0.323 | torch.Size([64]) || spynet.basic_module.0.basic_module.2.bias + | -0.010 | -4.633 | 0.568 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.0.basic_module.4.weight + | 0.158 | -0.704 | 0.861 | 0.357 | torch.Size([32]) || spynet.basic_module.0.basic_module.4.bias + | -0.024 | -1.714 | 0.414 | 0.091 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.0.basic_module.6.weight + | 0.779 | -1.061 | 1.164 | 0.519 | torch.Size([16]) || spynet.basic_module.0.basic_module.6.bias + | 0.000 | -0.148 | 0.161 | 0.018 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.0.basic_module.8.weight + | 0.002 | -0.000 | 0.004 | 0.003 | torch.Size([2]) || spynet.basic_module.0.basic_module.8.bias + | 0.000 | -0.745 | 0.760 | 0.070 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.1.basic_module.0.weight + | -0.019 | -0.848 | 0.359 | 0.331 | torch.Size([32]) || spynet.basic_module.1.basic_module.0.bias + | -0.010 | -3.373 | 0.916 | 0.099 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.1.basic_module.2.weight + | 0.037 | -1.227 | 0.720 | 0.303 | torch.Size([64]) || spynet.basic_module.1.basic_module.2.bias + | -0.009 | -4.425 | 0.539 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.1.basic_module.4.weight + | 0.158 | -0.758 | 0.988 | 0.386 | torch.Size([32]) || spynet.basic_module.1.basic_module.4.bias + | -0.020 | -1.647 | 0.319 | 0.084 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.1.basic_module.6.weight + | 0.777 | -1.211 | 1.152 | 0.550 | torch.Size([16]) || spynet.basic_module.1.basic_module.6.bias + | 0.000 | -0.126 | 0.144 | 0.017 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.1.basic_module.8.weight + | 0.004 | 0.001 | 0.008 | 0.005 | torch.Size([2]) || spynet.basic_module.1.basic_module.8.bias + | 0.000 | -0.938 | 0.872 | 0.088 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.2.basic_module.0.weight + | -0.028 | -1.086 | 0.552 | 0.435 | torch.Size([32]) || spynet.basic_module.2.basic_module.0.bias + | -0.011 | -4.624 | 1.203 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.2.basic_module.2.weight + | 0.022 | -1.298 | 0.715 | 0.312 | torch.Size([64]) || spynet.basic_module.2.basic_module.2.bias + | -0.010 | -1.806 | 0.627 | 0.092 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.2.basic_module.4.weight + | 0.118 | -0.698 | 0.750 
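
The upsample head above interleaves Transpose_Dim12 with PixelShuffle because nn.PixelShuffle rearranges only the trailing (C, H, W) dimensions: swapping the channel and temporal axes lets the per-frame 2x shuffle run on 5-D video tensors, and two such rounds give the overall 4x scale before conv_last maps back to 3 RGB channels. A shape walk-through of one round:

import torch
import torch.nn as nn

# After the 64 -> 256 Conv3d, one upsampling round proceeds as follows:
x = torch.randn(1, 256, 7, 32, 32)  # (n, c, d, h, w)
y = x.transpose(1, 2)               # Transpose_Dim12 -> (n, d, c, h, w)
y = nn.PixelShuffle(2)(y)           # 2x shuffle per frame -> (n, d, 64, 64, 64)
y = y.transpose(1, 2)               # back to (n, c, d, h, w)
print(y.shape)                      # torch.Size([1, 64, 7, 64, 64])
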
| 0.332 | torch.Size([32]) || spynet.basic_module.2.basic_module.4.bias + | -0.014 | -1.277 | 0.337 | 0.067 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.2.basic_module.6.weight + | 0.684 | -1.730 | 0.954 | 0.648 | torch.Size([16]) || spynet.basic_module.2.basic_module.6.bias + | 0.000 | -0.031 | 0.042 | 0.009 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.2.basic_module.8.weight + | -0.010 | -0.010 | -0.010 | 0.000 | torch.Size([2]) || spynet.basic_module.2.basic_module.8.bias + | -0.000 | -0.956 | 0.847 | 0.089 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.3.basic_module.0.weight + | -0.049 | -1.175 | 0.652 | 0.477 | torch.Size([32]) || spynet.basic_module.3.basic_module.0.bias + | -0.010 | -4.892 | 1.180 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.3.basic_module.2.weight + | 0.021 | -1.294 | 0.764 | 0.316 | torch.Size([64]) || spynet.basic_module.3.basic_module.2.bias + | -0.010 | -1.793 | 0.556 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.3.basic_module.4.weight + | 0.123 | -0.717 | 0.737 | 0.335 | torch.Size([32]) || spynet.basic_module.3.basic_module.4.bias + | -0.012 | -1.102 | 0.291 | 0.061 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.3.basic_module.6.weight + | 0.650 | -1.838 | 0.913 | 0.669 | torch.Size([16]) || spynet.basic_module.3.basic_module.6.bias + | 0.000 | -0.032 | 0.039 | 0.006 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.3.basic_module.8.weight + | 0.000 | -0.012 | 0.012 | 0.017 | torch.Size([2]) || spynet.basic_module.3.basic_module.8.bias + | -0.000 | -0.953 | 0.855 | 0.089 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.4.basic_module.0.weight + | -0.009 | -1.001 | 0.584 | 0.427 | torch.Size([32]) || spynet.basic_module.4.basic_module.0.bias + | -0.010 | -5.054 | 1.223 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.4.basic_module.2.weight + | 0.023 | -1.315 | 0.884 | 0.326 | torch.Size([64]) || spynet.basic_module.4.basic_module.2.bias + | -0.009 | -1.786 | 0.534 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.4.basic_module.4.weight + | 0.142 | -0.698 | 0.780 | 0.342 | torch.Size([32]) || spynet.basic_module.4.basic_module.4.bias + | -0.011 | -0.957 | 0.276 | 0.057 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.4.basic_module.6.weight + | 0.653 | -1.854 | 0.943 | 0.677 | torch.Size([16]) || spynet.basic_module.4.basic_module.6.bias + | 0.000 | -0.034 | 0.035 | 0.005 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.4.basic_module.8.weight + | -0.001 | -0.010 | 0.008 | 0.012 | torch.Size([2]) || spynet.basic_module.4.basic_module.8.bias + | -0.000 | -0.918 | 0.865 | 0.087 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.5.basic_module.0.weight + | 0.047 | -0.824 | 0.510 | 0.392 | torch.Size([32]) || spynet.basic_module.5.basic_module.0.bias + | -0.009 | -5.094 | 1.213 | 0.118 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.5.basic_module.2.weight + | 0.029 | -1.319 | 0.938 | 0.330 | torch.Size([64]) || spynet.basic_module.5.basic_module.2.bias + | -0.007 | -1.794 | 0.519 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.5.basic_module.4.weight + | 0.145 | -0.725 | 0.830 | 0.349 | torch.Size([32]) || spynet.basic_module.5.basic_module.4.bias + | -0.008 | -0.766 | 0.275 | 0.052 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.5.basic_module.6.weight + | 0.659 | -1.945 | 0.999 | 0.706 | torch.Size([16]) || spynet.basic_module.5.basic_module.6.bias + | 0.000 | -0.025 | 0.026 | 0.002 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.5.basic_module.8.weight + | 
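
The spynet.basic_module.0 through .5 rows describe six pyramid levels of the optical-flow estimator, each a five-convolution stack from 8 input channels down to a 2-channel flow residual. A sketch of one level, assuming the usual SpyNet layout (reference frame + warped frame + 2-channel upsampled flow as input, ReLU in the odd slots that carry no parameters in the table):

import torch.nn as nn

# One pyramid level shaped like the printed basic_module stacks.
level = nn.Sequential(
    nn.Conv2d(8, 32, 7, padding=3), nn.ReLU(),
    nn.Conv2d(32, 64, 7, padding=3), nn.ReLU(),
    nn.Conv2d(64, 32, 7, padding=3), nn.ReLU(),
    nn.Conv2d(32, 16, 7, padding=3), nn.ReLU(),
    nn.Conv2d(16, 2, 7, padding=3),  # 2-channel flow residual
)
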
0.014 | 0.001 | 0.027 | 0.018 | torch.Size([2]) || spynet.basic_module.5.basic_module.8.bias + | 1.335 | 0.614 | 2.324 | 0.313 | torch.Size([120]) || stage1.reshape.1.weight + | -0.007 | -0.451 | 0.392 | 0.149 | torch.Size([120]) || stage1.reshape.1.bias + | 0.640 | 0.164 | 1.487 | 0.258 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.weight + | -0.072 | -1.225 | 0.558 | 0.260 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.bias + | -0.295 | -4.200 | 2.891 | 0.402 | torch.Size([675, 6]) || stage1.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.0.attn.position_bias + | 0.001 | -0.736 | 0.771 | 0.143 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_self.weight + | -0.002 | -0.412 | 0.503 | 0.106 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_self.bias + | 0.001 | -0.711 | 0.595 | 0.091 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.attn.proj.weight + | -0.006 | -0.195 | 0.530 | 0.097 | torch.Size([120]) || stage1.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -1.076 | 1.181 | 0.133 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.000 | -0.228 | 0.294 | 0.059 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.836 | 0.408 | 1.248 | 0.162 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.weight + | 0.042 | -0.494 | 0.495 | 0.159 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.bias + | 0.003 | -0.889 | 0.982 | 0.142 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc11.weight + | 0.041 | -0.364 | 0.458 | 0.117 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.757 | 0.882 | 0.140 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc12.weight + | 0.011 | -0.400 | 0.470 | 0.157 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.852 | 1.093 | 0.139 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.mlp.fc2.weight + | 0.022 | -0.265 | 0.384 | 0.096 | torch.Size([120]) || stage1.residual_group1.blocks.0.mlp.fc2.bias + | 0.894 | 0.195 | 1.588 | 0.211 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.weight + | -0.156 | -1.734 | 0.260 | 0.208 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.bias + | -0.433 | -4.335 | 2.455 | 0.555 | torch.Size([675, 6]) || stage1.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.1.attn.position_bias + | -0.001 | -1.631 | 1.615 | 0.174 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_self.weight + | 0.005 | -0.246 | 0.392 | 0.072 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.697 | 0.574 | 0.098 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.attn.proj.weight + | 0.011 | -0.191 | 0.529 | 0.104 | torch.Size([120]) || stage1.residual_group1.blocks.1.attn.proj.bias + | -0.001 | -1.260 | 1.186 | 0.133 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.002 | -0.207 | 0.162 | 0.050 | torch.Size([360]) || 
stage1.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.725 | 0.421 | 0.899 | 0.072 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.weight + | 0.043 | -0.750 | 0.403 | 0.161 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.bias + | -0.001 | -0.950 | 0.899 | 0.146 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc11.weight + | -0.001 | -0.381 | 0.301 | 0.092 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.615 | 0.630 | 0.142 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc12.weight + | 0.009 | -0.473 | 0.647 | 0.131 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc12.bias + | 0.001 | -0.789 | 0.813 | 0.146 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.mlp.fc2.weight + | -0.041 | -0.335 | 0.331 | 0.119 | torch.Size([120]) || stage1.residual_group1.blocks.1.mlp.fc2.bias + | 1.087 | 0.163 | 1.663 | 0.218 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.weight + | -0.188 | -1.539 | 0.134 | 0.175 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.bias + | -0.505 | -4.230 | 3.070 | 0.545 | torch.Size([675, 6]) || stage1.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.2.attn.position_bias + | -0.000 | -1.348 | 1.453 | 0.171 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_self.weight + | 0.007 | -0.394 | 0.633 | 0.080 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_self.bias + | 0.001 | -0.561 | 0.466 | 0.108 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.attn.proj.weight + | 0.028 | -0.263 | 0.277 | 0.111 | torch.Size([120]) || stage1.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.982 | 1.268 | 0.124 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.001 | -0.139 | 0.149 | 0.035 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.743 | 0.234 | 0.925 | 0.092 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.weight + | 0.030 | -1.015 | 0.440 | 0.156 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.bias + | -0.002 | -0.956 | 1.234 | 0.155 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc11.weight + | 0.003 | -0.419 | 0.302 | 0.108 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.723 | 0.609 | 0.143 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc12.weight + | -0.007 | -0.362 | 0.529 | 0.129 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.768 | 0.645 | 0.147 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.mlp.fc2.weight + | -0.033 | -0.281 | 0.244 | 0.100 | torch.Size([120]) || stage1.residual_group1.blocks.2.mlp.fc2.bias + | 1.076 | 0.178 | 1.503 | 0.199 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.weight + | -0.153 | -1.699 | 0.096 | 0.171 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.bias + | -0.815 | -4.386 | 4.546 | 0.797 | torch.Size([675, 6]) || stage1.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || 
stage1.residual_group1.blocks.3.attn.position_bias + | 0.001 | -2.332 | 2.215 | 0.164 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_self.weight + | -0.004 | -0.455 | 0.400 | 0.070 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.504 | 0.556 | 0.108 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.attn.proj.weight + | -0.006 | -0.339 | 0.365 | 0.137 | torch.Size([120]) || stage1.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -1.444 | 1.191 | 0.122 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.001 | -0.162 | 0.140 | 0.029 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.715 | 0.229 | 0.865 | 0.078 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.weight + | 0.026 | -1.011 | 0.287 | 0.151 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.bias + | -0.003 | -0.761 | 0.828 | 0.148 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc11.weight + | 0.014 | -0.337 | 0.418 | 0.135 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.716 | 0.712 | 0.149 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc12.weight + | 0.003 | -0.427 | 0.369 | 0.124 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc12.bias + | 0.001 | -0.719 | 0.640 | 0.151 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.mlp.fc2.weight + | -0.010 | -0.557 | 0.227 | 0.103 | torch.Size([120]) || stage1.residual_group1.blocks.3.mlp.fc2.bias + | 1.161 | 0.188 | 1.556 | 0.179 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.weight + | -0.165 | -1.773 | 0.054 | 0.186 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.bias + | -0.575 | -3.741 | 5.261 | 0.767 | torch.Size([675, 6]) || stage1.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.4.attn.position_bias + | 0.000 | -2.020 | 2.251 | 0.173 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_self.weight + | 0.000 | -0.318 | 0.312 | 0.071 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.463 | 0.456 | 0.112 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.attn.proj.weight + | 0.002 | -0.406 | 0.393 | 0.154 | torch.Size([120]) || stage1.residual_group1.blocks.4.attn.proj.bias + | -0.001 | -0.968 | 1.330 | 0.123 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.001 | -0.152 | 0.176 | 0.030 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.699 | 0.230 | 0.850 | 0.073 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.weight + | 0.029 | -1.033 | 0.300 | 0.149 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.bias + | -0.002 | -0.718 | 0.803 | 0.145 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc11.weight + | 0.002 | -0.389 | 0.405 | 0.139 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc11.bias + | -0.001 | -0.582 | 0.624 | 0.151 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc12.weight + | 0.003 | -0.385 | 0.386 | 0.118 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.677 | 0.737 | 0.153 | torch.Size([120, 240]) || 
stage1.residual_group1.blocks.4.mlp.fc2.weight + | 0.003 | -0.671 | 0.208 | 0.108 | torch.Size([120]) || stage1.residual_group1.blocks.4.mlp.fc2.bias + | 1.067 | 0.173 | 1.473 | 0.179 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.weight + | -0.129 | -1.487 | 0.138 | 0.166 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.bias + | -0.530 | -3.629 | 3.705 | 0.621 | torch.Size([675, 6]) || stage1.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.5.attn.position_bias + | 0.000 | -2.344 | 1.768 | 0.157 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_self.weight + | -0.001 | -0.428 | 0.265 | 0.082 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_self.bias + | -0.001 | -0.541 | 0.559 | 0.120 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.attn.proj.weight + | 0.031 | -0.324 | 0.379 | 0.133 | torch.Size([120]) || stage1.residual_group1.blocks.5.attn.proj.bias + | -0.001 | -1.380 | 0.992 | 0.120 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.000 | -0.100 | 0.111 | 0.027 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.637 | 0.273 | 0.780 | 0.064 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.weight + | 0.022 | -1.160 | 0.338 | 0.149 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.bias + | -0.002 | -0.696 | 0.638 | 0.139 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc11.weight + | 0.007 | -0.366 | 0.364 | 0.134 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc11.bias + | -0.001 | -0.581 | 0.657 | 0.151 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc12.weight + | -0.004 | -0.366 | 0.244 | 0.105 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -1.143 | 0.787 | 0.154 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.mlp.fc2.weight + | 0.023 | -1.254 | 0.407 | 0.160 | torch.Size([120]) || stage1.residual_group1.blocks.5.mlp.fc2.bias + | 0.001 | -0.293 | 0.270 | 0.065 | torch.Size([120, 120]) || stage1.linear1.weight + | 0.006 | -0.209 | 0.382 | 0.093 | torch.Size([120]) || stage1.linear1.bias + | 0.811 | 0.432 | 1.092 | 0.108 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.weight + | 0.033 | -0.763 | 0.477 | 0.200 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.bias + | -0.049 | -2.996 | 1.734 | 0.246 | torch.Size([3375, 6]) || stage1.residual_group2.blocks.0.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage1.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.847 | 1.215 | 0.150 | torch.Size([360, 120]) || stage1.residual_group2.blocks.0.attn.qkv_self.weight + | -0.000 | -0.542 | 0.581 | 0.147 | torch.Size([360]) || stage1.residual_group2.blocks.0.attn.qkv_self.bias + | 0.001 | -0.536 | 0.569 | 0.124 | torch.Size([120, 120]) || stage1.residual_group2.blocks.0.attn.proj.weight + | -0.004 | -0.195 | 0.602 | 0.102 | torch.Size([120]) || stage1.residual_group2.blocks.0.attn.proj.bias + | 0.568 | 0.438 | 0.872 | 0.074 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.weight + | 0.025 | -0.782 | 0.342 | 0.164 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.bias + | 0.003 | -0.601 | 0.699 | 0.126 | 
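
The relative_position_bias_table and relative_position_index shapes encode the attention window sizes: a (d, h, w) window has d*h*w tokens and (2d-1)(2h-1)(2w-1) distinct relative offsets. The 675x6 tables with 128x128 indices in residual_group1 and the 3375x6 tables with 512x512 indices in residual_group2 are consistent with (2, 8, 8) and (8, 8, 8) windows, respectively:

# Consistency check for the printed relative-position shapes, assuming
# (2, 8, 8) windows in residual_group1 and (8, 8, 8) in residual_group2:
for window in [(2, 8, 8), (8, 8, 8)]:
    d, h, w = window
    table_rows = (2 * d - 1) * (2 * h - 1) * (2 * w - 1)
    tokens = d * h * w
    print(window, table_rows, (tokens, tokens))
# (2, 8, 8) 675 (128, 128)   -> 675x6 table, 128x128 index
# (8, 8, 8) 3375 (512, 512)  -> 3375x6 table, 512x512 index
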
torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc11.weight + | 0.068 | -0.329 | 0.446 | 0.095 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc11.bias + | 0.001 | -0.807 | 0.710 | 0.143 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc12.weight + | -0.002 | -0.585 | 0.392 | 0.117 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.779 | 0.575 | 0.142 | torch.Size([120, 240]) || stage1.residual_group2.blocks.0.mlp.fc2.weight + | 0.008 | -0.377 | 0.374 | 0.159 | torch.Size([120]) || stage1.residual_group2.blocks.0.mlp.fc2.bias + | 0.942 | 0.411 | 1.171 | 0.093 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.weight + | 0.038 | -0.837 | 0.321 | 0.152 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.bias + | -0.077 | -2.150 | 2.175 | 0.237 | torch.Size([3375, 6]) || stage1.residual_group2.blocks.1.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage1.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.750 | 0.771 | 0.159 | torch.Size([360, 120]) || stage1.residual_group2.blocks.1.attn.qkv_self.weight + | -0.004 | -0.589 | 0.559 | 0.145 | torch.Size([360]) || stage1.residual_group2.blocks.1.attn.qkv_self.bias + | -0.000 | -0.478 | 0.525 | 0.125 | torch.Size([120, 120]) || stage1.residual_group2.blocks.1.attn.proj.weight + | 0.009 | -0.338 | 0.449 | 0.154 | torch.Size([120]) || stage1.residual_group2.blocks.1.attn.proj.bias + | 0.597 | 0.429 | 0.741 | 0.044 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.weight + | 0.038 | -0.697 | 0.195 | 0.103 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.bias + | 0.003 | -0.671 | 0.636 | 0.135 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc11.weight + | 0.057 | -0.519 | 0.422 | 0.139 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.629 | 0.607 | 0.153 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc12.weight + | -0.007 | -0.279 | 0.403 | 0.083 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc12.bias + | 0.001 | -0.620 | 0.712 | 0.150 | torch.Size([120, 240]) || stage1.residual_group2.blocks.1.mlp.fc2.weight + | 0.014 | -0.721 | 0.333 | 0.163 | torch.Size([120]) || stage1.residual_group2.blocks.1.mlp.fc2.bias + | 0.000 | -0.504 | 0.343 | 0.079 | torch.Size([120, 120]) || stage1.linear2.weight + | 0.015 | -0.276 | 0.353 | 0.122 | torch.Size([120]) || stage1.linear2.bias + | -0.000 | -0.151 | 0.136 | 0.025 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.weight + | -0.001 | -0.087 | 0.103 | 0.030 | torch.Size([120]) || stage1.pa_deform.bias + | 0.000 | -0.017 | 0.017 | 0.010 | torch.Size([120, 364, 3, 3]) || stage1.pa_deform.conv_offset.0.weight + | -0.004 | -0.024 | 0.040 | 0.013 | torch.Size([120]) || stage1.pa_deform.conv_offset.0.bias + | -0.001 | -0.122 | 0.123 | 0.017 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.2.weight + | -0.009 | -0.068 | 0.068 | 0.028 | torch.Size([120]) || stage1.pa_deform.conv_offset.2.bias + | -0.001 | -0.175 | 0.114 | 0.015 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.4.weight + | 0.019 | -0.059 | 0.110 | 0.042 | torch.Size([120]) || stage1.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432, 120, 3, 3]) || stage1.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432]) || stage1.pa_deform.conv_offset.6.bias + | -0.001 | -1.034 | 1.208 | 0.150 | 
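
Two details stand out in the pa_deform rows: conv_offset.6 produces 432 channels, and its weight and bias statistics are exactly zero. The channel count is consistent with 16 deformable groups times nine 3x3 sampling points, each carrying a 2-D offset plus a modulation mask, and the zeros reflect the common DCN practice of zero-initialising the final offset layer so that deformable sampling starts out close to a plain convolution:

# Channel budget of the final offset conv, assuming deformable_groups=16
# and 3x3 kernels with per-point (x, y) offsets plus a modulation mask:
groups, points = 16, 3 * 3
offsets = groups * 2 * points  # 288
masks = groups * 1 * points    # 144
print(offsets + masks)         # 432
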
torch.Size([360, 360]) || stage1.pa_fuse.fc11.weight + | 0.085 | -0.220 | 0.682 | 0.164 | torch.Size([360]) || stage1.pa_fuse.fc11.bias + | 0.001 | -1.305 | 1.408 | 0.167 | torch.Size([360, 360]) || stage1.pa_fuse.fc12.weight + | 0.005 | -0.474 | 0.521 | 0.147 | torch.Size([360]) || stage1.pa_fuse.fc12.bias + | 0.000 | -0.941 | 0.939 | 0.158 | torch.Size([120, 360]) || stage1.pa_fuse.fc2.weight + | 0.019 | -0.993 | 0.852 | 0.371 | torch.Size([120]) || stage1.pa_fuse.fc2.bias + | 1.099 | 0.165 | 1.669 | 0.285 | torch.Size([480]) || stage2.reshape.1.weight + | -0.009 | -0.723 | 0.825 | 0.237 | torch.Size([480]) || stage2.reshape.1.bias + | -0.000 | -0.767 | 0.672 | 0.163 | torch.Size([120, 480]) || stage2.reshape.2.weight + | -0.007 | -0.473 | 0.285 | 0.116 | torch.Size([120]) || stage2.reshape.2.bias + | 0.665 | 0.267 | 1.019 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.weight + | -0.152 | -0.897 | 0.303 | 0.218 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.bias + | -0.208 | -1.940 | 4.459 | 0.383 | torch.Size([675, 6]) || stage2.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.653 | 0.613 | 0.127 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_self.weight + | 0.003 | -0.263 | 0.270 | 0.066 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_self.bias + | 0.002 | -0.796 | 0.596 | 0.108 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.attn.proj.weight + | -0.008 | -0.955 | 0.285 | 0.127 | torch.Size([120]) || stage2.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -1.099 | 0.979 | 0.109 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.131 | 0.090 | 0.022 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.548 | 0.301 | 0.671 | 0.063 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.weight + | 0.003 | -0.744 | 0.803 | 0.231 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.bias + | 0.001 | -0.645 | 0.555 | 0.133 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc11.weight + | 0.013 | -0.406 | 0.272 | 0.097 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.622 | 0.666 | 0.147 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc12.weight + | 0.002 | -0.228 | 0.307 | 0.085 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc12.bias + | 0.001 | -0.834 | 0.822 | 0.149 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.mlp.fc2.weight + | -0.009 | -0.948 | 0.446 | 0.159 | torch.Size([120]) || stage2.residual_group1.blocks.0.mlp.fc2.bias + | 0.777 | 0.311 | 1.104 | 0.161 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.weight + | -0.178 | -0.966 | 0.822 | 0.247 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.bias + | -0.387 | -2.000 | 5.826 | 0.443 | torch.Size([675, 6]) || stage2.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.662 | 0.706 | 0.132 | torch.Size([360, 120]) || 
stage2.residual_group1.blocks.1.attn.qkv_self.weight + | -0.006 | -0.348 | 0.306 | 0.079 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_self.bias + | -0.001 | -0.595 | 0.730 | 0.112 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.attn.proj.weight + | -0.001 | -0.811 | 0.531 | 0.167 | torch.Size([120]) || stage2.residual_group1.blocks.1.attn.proj.bias + | -0.000 | -1.007 | 1.002 | 0.105 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.002 | -0.180 | 0.108 | 0.024 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.599 | 0.282 | 0.730 | 0.059 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.weight + | -0.004 | -0.671 | 0.938 | 0.218 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.536 | 0.570 | 0.134 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc11.weight + | -0.022 | -0.540 | 0.226 | 0.107 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.646 | 0.589 | 0.149 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc12.weight + | 0.008 | -0.203 | 0.282 | 0.092 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -1.052 | 0.649 | 0.150 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.mlp.fc2.weight + | -0.007 | -0.581 | 0.467 | 0.137 | torch.Size([120]) || stage2.residual_group1.blocks.1.mlp.fc2.bias + | 0.780 | 0.134 | 1.161 | 0.193 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.weight + | -0.152 | -0.996 | 1.042 | 0.227 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.bias + | -0.186 | -2.565 | 4.152 | 0.428 | torch.Size([675, 6]) || stage2.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.2.attn.position_bias + | 0.001 | -0.856 | 0.814 | 0.151 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_self.weight + | -0.002 | -0.367 | 0.317 | 0.074 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_self.bias + | -0.001 | -0.656 | 0.730 | 0.131 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.attn.proj.weight + | -0.003 | -0.555 | 0.620 | 0.163 | torch.Size([120]) || stage2.residual_group1.blocks.2.attn.proj.bias + | 0.001 | -2.191 | 2.575 | 0.137 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.000 | -0.121 | 0.139 | 0.023 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.640 | 0.297 | 0.797 | 0.064 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.weight + | -0.013 | -0.584 | 0.934 | 0.217 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.523 | 0.556 | 0.136 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc11.weight + | -0.035 | -0.490 | 0.217 | 0.117 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.679 | 0.601 | 0.152 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc12.weight + | 0.005 | -0.287 | 0.308 | 0.098 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.576 | 0.584 | 0.151 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.mlp.fc2.weight + | -0.006 | -0.423 | 0.376 | 0.121 | torch.Size([120]) || stage2.residual_group1.blocks.2.mlp.fc2.bias + 
| 0.776 | 0.134 | 1.030 | 0.164 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.weight + | -0.167 | -0.870 | 1.066 | 0.204 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.bias + | -0.259 | -1.735 | 5.189 | 0.366 | torch.Size([675, 6]) || stage2.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.3.attn.position_bias + | 0.000 | -1.292 | 1.255 | 0.149 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_self.weight + | 0.000 | -0.493 | 0.445 | 0.101 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_self.bias + | 0.001 | -0.618 | 0.582 | 0.122 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.attn.proj.weight + | -0.001 | -0.543 | 0.420 | 0.166 | torch.Size([120]) || stage2.residual_group1.blocks.3.attn.proj.bias + | 0.002 | -2.296 | 2.630 | 0.162 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.001 | -0.130 | 0.149 | 0.028 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.625 | 0.301 | 0.772 | 0.060 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.weight + | -0.015 | -0.498 | 0.992 | 0.198 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.bias + | -0.000 | -0.620 | 0.681 | 0.130 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc11.weight + | -0.006 | -0.391 | 0.256 | 0.113 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.575 | 0.669 | 0.152 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc12.weight + | -0.000 | -0.225 | 0.333 | 0.088 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc12.bias + | 0.001 | -0.680 | 0.639 | 0.151 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.mlp.fc2.weight + | -0.011 | -0.549 | 0.259 | 0.139 | torch.Size([120]) || stage2.residual_group1.blocks.3.mlp.fc2.bias + | 0.933 | 0.310 | 1.186 | 0.121 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.weight + | -0.180 | -0.736 | 1.168 | 0.204 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.bias + | -0.164 | -2.965 | 4.145 | 0.437 | torch.Size([675, 6]) || stage2.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.4.attn.position_bias + | 0.000 | -0.860 | 0.749 | 0.136 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_self.weight + | 0.005 | -0.274 | 0.308 | 0.080 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_self.bias + | 0.001 | -0.648 | 0.681 | 0.129 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.attn.proj.weight + | 0.002 | -0.547 | 0.295 | 0.149 | torch.Size([120]) || stage2.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.647 | 0.577 | 0.105 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.001 | -0.138 | 0.125 | 0.023 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.635 | 0.329 | 0.748 | 0.049 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.weight + | -0.018 | -0.375 | 0.891 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.bias + | -0.000 | 
-0.603 | 0.497 | 0.130 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc11.weight + | -0.010 | -0.345 | 0.297 | 0.113 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.680 | 0.679 | 0.153 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc12.weight + | -0.000 | -0.200 | 0.251 | 0.086 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc12.bias + | -0.001 | -0.568 | 0.614 | 0.152 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.mlp.fc2.weight + | -0.009 | -0.375 | 0.493 | 0.135 | torch.Size([120]) || stage2.residual_group1.blocks.4.mlp.fc2.bias + | 0.870 | 0.315 | 1.059 | 0.096 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.weight + | -0.139 | -0.657 | 1.107 | 0.163 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.bias + | -0.156 | -4.167 | 4.651 | 0.340 | torch.Size([675, 6]) || stage2.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.701 | 0.871 | 0.134 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_self.weight + | -0.000 | -0.427 | 0.471 | 0.099 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.520 | 0.546 | 0.113 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.attn.proj.weight + | -0.008 | -0.360 | 0.350 | 0.137 | torch.Size([120]) || stage2.residual_group1.blocks.5.attn.proj.bias + | 0.001 | -0.510 | 0.502 | 0.100 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.001 | -0.092 | 0.125 | 0.021 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.597 | 0.345 | 0.691 | 0.044 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.weight + | -0.015 | -0.367 | 0.987 | 0.132 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.bias + | 0.001 | -0.552 | 0.532 | 0.128 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc11.weight + | -0.009 | -0.336 | 0.253 | 0.107 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc11.bias + | 0.000 | -0.644 | 0.758 | 0.154 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc12.weight + | -0.001 | -0.243 | 0.264 | 0.088 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc12.bias + | -0.001 | -0.667 | 0.621 | 0.152 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.mlp.fc2.weight + | -0.002 | -0.447 | 1.139 | 0.183 | torch.Size([120]) || stage2.residual_group1.blocks.5.mlp.fc2.bias + | 0.002 | -0.268 | 0.331 | 0.066 | torch.Size([120, 120]) || stage2.linear1.weight + | 0.005 | -0.338 | 0.589 | 0.128 | torch.Size([120]) || stage2.linear1.bias + | 0.939 | 0.517 | 1.207 | 0.113 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.weight + | 0.023 | -0.770 | 0.614 | 0.238 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.bias + | 0.004 | -3.112 | 1.341 | 0.140 | torch.Size([3375, 6]) || stage2.residual_group2.blocks.0.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage2.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.605 | 0.580 | 0.136 | torch.Size([360, 120]) || stage2.residual_group2.blocks.0.attn.qkv_self.weight + | 0.001 | -0.591 | 0.477 | 0.112 | torch.Size([360]) || 
stage2.residual_group2.blocks.0.attn.qkv_self.bias + | 0.001 | -0.645 | 0.613 | 0.150 | torch.Size([120, 120]) || stage2.residual_group2.blocks.0.attn.proj.weight + | -0.031 | -0.422 | 0.330 | 0.138 | torch.Size([120]) || stage2.residual_group2.blocks.0.attn.proj.bias + | 0.684 | 0.501 | 0.807 | 0.061 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.weight + | 0.018 | -0.693 | 0.412 | 0.181 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.bias + | 0.001 | -0.559 | 0.715 | 0.125 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc11.weight + | 0.031 | -0.346 | 0.273 | 0.108 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.744 | 0.559 | 0.146 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc12.weight + | -0.005 | -0.239 | 0.270 | 0.080 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.603 | 0.871 | 0.144 | torch.Size([120, 240]) || stage2.residual_group2.blocks.0.mlp.fc2.weight + | -0.003 | -0.317 | 0.303 | 0.122 | torch.Size([120]) || stage2.residual_group2.blocks.0.mlp.fc2.bias + | 0.974 | 0.575 | 1.211 | 0.095 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.weight + | 0.023 | -0.703 | 0.556 | 0.208 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.bias + | 0.012 | -2.867 | 1.552 | 0.185 | torch.Size([3375, 6]) || stage2.residual_group2.blocks.1.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage2.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.743 | 0.663 | 0.142 | torch.Size([360, 120]) || stage2.residual_group2.blocks.1.attn.qkv_self.weight + | 0.002 | -0.647 | 0.654 | 0.141 | torch.Size([360]) || stage2.residual_group2.blocks.1.attn.qkv_self.bias + | -0.000 | -0.610 | 0.648 | 0.151 | torch.Size([120, 120]) || stage2.residual_group2.blocks.1.attn.proj.weight + | -0.028 | -0.565 | 0.416 | 0.167 | torch.Size([120]) || stage2.residual_group2.blocks.1.attn.proj.bias + | 0.742 | 0.522 | 0.891 | 0.076 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.weight + | 0.020 | -0.506 | 0.335 | 0.138 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.bias + | 0.001 | -0.486 | 0.512 | 0.123 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc11.weight + | 0.094 | -0.405 | 0.617 | 0.174 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.618 | 0.596 | 0.149 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc12.weight + | -0.001 | -0.276 | 0.202 | 0.077 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.668 | 0.769 | 0.148 | torch.Size([120, 240]) || stage2.residual_group2.blocks.1.mlp.fc2.weight + | -0.014 | -0.729 | 0.410 | 0.187 | torch.Size([120]) || stage2.residual_group2.blocks.1.mlp.fc2.bias + | 0.001 | -0.309 | 0.381 | 0.079 | torch.Size([120, 120]) || stage2.linear2.weight + | 0.017 | -0.403 | 0.399 | 0.133 | torch.Size([120]) || stage2.linear2.bias + | -0.000 | -0.111 | 0.126 | 0.024 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.weight + | 0.001 | -0.031 | 0.055 | 0.017 | torch.Size([120]) || stage2.pa_deform.bias + | 0.000 | -0.017 | 0.017 | 0.010 | torch.Size([120, 364, 3, 3]) || stage2.pa_deform.conv_offset.0.weight + | -0.010 | -0.038 | 0.021 | 0.012 | torch.Size([120]) || stage2.pa_deform.conv_offset.0.bias + | -0.001 | -0.113 | 0.096 | 0.020 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.2.weight + | -0.010 | -0.089 | 0.087 | 
0.032 | torch.Size([120]) || stage2.pa_deform.conv_offset.2.bias + | -0.001 | -0.079 | 0.087 | 0.019 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.4.weight + | -0.015 | -0.134 | 0.121 | 0.058 | torch.Size([120]) || stage2.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432, 120, 3, 3]) || stage2.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432]) || stage2.pa_deform.conv_offset.6.bias + | 0.004 | -1.011 | 1.138 | 0.150 | torch.Size([360, 360]) || stage2.pa_fuse.fc11.weight + | 0.151 | -0.228 | 0.674 | 0.167 | torch.Size([360]) || stage2.pa_fuse.fc11.bias + | 0.001 | -0.988 | 1.066 | 0.144 | torch.Size([360, 360]) || stage2.pa_fuse.fc12.weight + | 0.009 | -0.418 | 0.533 | 0.127 | torch.Size([360]) || stage2.pa_fuse.fc12.bias + | 0.000 | -0.784 | 0.831 | 0.151 | torch.Size([120, 360]) || stage2.pa_fuse.fc2.weight + | 0.007 | -0.581 | 0.470 | 0.257 | torch.Size([120]) || stage2.pa_fuse.fc2.bias + | 1.105 | 0.504 | 1.774 | 0.248 | torch.Size([480]) || stage3.reshape.1.weight + | -0.006 | -0.633 | 0.736 | 0.296 | torch.Size([480]) || stage3.reshape.1.bias + | -0.000 | -0.682 | 0.687 | 0.168 | torch.Size([120, 480]) || stage3.reshape.2.weight + | -0.004 | -0.207 | 0.227 | 0.086 | torch.Size([120]) || stage3.reshape.2.bias + | 0.735 | 0.431 | 0.997 | 0.127 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.weight + | -0.162 | -0.753 | 0.303 | 0.198 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.bias + | -0.001 | -0.490 | 0.344 | 0.037 | torch.Size([675, 6]) || stage3.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.333 | 0.350 | 0.061 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_self.weight + | -0.004 | -0.195 | 0.128 | 0.039 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.359 | 0.365 | 0.067 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.attn.proj.weight + | -0.002 | -0.216 | 0.262 | 0.084 | torch.Size([120]) || stage3.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.597 | 0.657 | 0.058 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.001 | -0.115 | 0.118 | 0.020 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.594 | 0.414 | 0.775 | 0.069 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.weight + | 0.003 | -0.260 | 0.315 | 0.105 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.bias + | 0.001 | -0.446 | 0.536 | 0.116 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc11.weight + | -0.077 | -0.361 | 0.145 | 0.072 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.507 | 0.503 | 0.124 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc12.weight + | 0.005 | -0.225 | 0.207 | 0.062 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.553 | 0.493 | 0.129 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.mlp.fc2.weight + | -0.006 | -0.268 | 0.158 | 0.085 | torch.Size([120]) || stage3.residual_group1.blocks.0.mlp.fc2.bias + | 0.716 | 0.376 | 0.965 | 0.119 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.weight + | -0.185 | -0.732 | 0.209 | 0.179 | 
torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.bias + | -0.002 | -0.462 | 1.414 | 0.064 | torch.Size([675, 6]) || stage3.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.383 | 0.438 | 0.060 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_self.weight + | -0.002 | -0.229 | 0.157 | 0.044 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.357 | 0.478 | 0.065 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.attn.proj.weight + | -0.004 | -0.280 | 0.216 | 0.101 | torch.Size([120]) || stage3.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.471 | 0.517 | 0.063 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.000 | -0.112 | 0.131 | 0.022 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.633 | 0.486 | 0.778 | 0.057 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.weight + | 0.004 | -0.350 | 0.280 | 0.107 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.513 | 0.512 | 0.118 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc11.weight + | -0.081 | -0.274 | 0.096 | 0.071 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.548 | 0.533 | 0.126 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc12.weight + | -0.003 | -0.181 | 0.194 | 0.059 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.499 | 0.534 | 0.128 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.mlp.fc2.weight + | -0.007 | -0.282 | 0.152 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.1.mlp.fc2.bias + | 0.796 | 0.469 | 1.007 | 0.111 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.weight + | -0.109 | -0.638 | 0.181 | 0.146 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.bias + | -0.004 | -1.009 | 1.155 | 0.105 | torch.Size([675, 6]) || stage3.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.2.attn.position_bias + | -0.000 | -0.378 | 0.375 | 0.081 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_self.weight + | 0.003 | -0.263 | 0.331 | 0.066 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.485 | 0.366 | 0.074 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.attn.proj.weight + | -0.001 | -0.249 | 0.145 | 0.080 | torch.Size([120]) || stage3.residual_group1.blocks.2.attn.proj.bias + | -0.001 | -0.332 | 0.421 | 0.063 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.001 | -0.098 | 0.083 | 0.016 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.657 | 0.507 | 0.776 | 0.053 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.weight + | 0.003 | -0.270 | 0.280 | 0.104 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.445 | 0.556 | 0.117 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc11.weight + | -0.097 | -0.295 | 0.100 | 0.070 | 
torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.480 | 0.501 | 0.126 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc12.weight + | 0.005 | -0.148 | 0.191 | 0.060 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc12.bias + | 0.001 | -0.569 | 0.484 | 0.126 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.mlp.fc2.weight + | -0.006 | -0.246 | 0.161 | 0.082 | torch.Size([120]) || stage3.residual_group1.blocks.2.mlp.fc2.bias + | 0.814 | 0.482 | 1.048 | 0.109 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.weight + | -0.138 | -0.585 | 0.128 | 0.129 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.bias + | -0.008 | -1.801 | 4.148 | 0.110 | torch.Size([675, 6]) || stage3.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.3.attn.position_bias + | -0.001 | -0.364 | 0.546 | 0.076 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_self.weight + | 0.003 | -0.179 | 0.182 | 0.046 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.378 | 0.385 | 0.070 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.attn.proj.weight + | -0.005 | -0.368 | 0.175 | 0.101 | torch.Size([120]) || stage3.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.338 | 0.461 | 0.062 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.000 | -0.098 | 0.082 | 0.019 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.676 | 0.526 | 0.799 | 0.056 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.weight + | 0.002 | -0.269 | 0.242 | 0.090 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.474 | 0.505 | 0.118 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc11.weight + | -0.095 | -0.247 | 0.071 | 0.063 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.518 | 0.502 | 0.126 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc12.weight + | -0.003 | -0.194 | 0.228 | 0.068 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc12.bias + | -0.001 | -0.502 | 0.499 | 0.124 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.mlp.fc2.weight + | -0.007 | -0.248 | 0.207 | 0.098 | torch.Size([120]) || stage3.residual_group1.blocks.3.mlp.fc2.bias + | 0.843 | 0.498 | 1.046 | 0.099 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.weight + | -0.082 | -0.456 | 0.195 | 0.111 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.bias + | -0.012 | -3.133 | 2.263 | 0.177 | torch.Size([675, 6]) || stage3.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.4.attn.position_bias + | 0.001 | -0.494 | 0.443 | 0.096 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_self.weight + | -0.004 | -0.492 | 0.329 | 0.088 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_self.bias + | -0.000 | -0.464 | 0.391 | 0.080 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.attn.proj.weight + | -0.003 | -0.420 | 0.332 | 0.124 | 
torch.Size([120]) || stage3.residual_group1.blocks.4.attn.proj.bias + | 0.001 | -0.469 | 0.518 | 0.068 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.001 | -0.068 | 0.099 | 0.014 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.705 | 0.598 | 0.823 | 0.047 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.weight + | 0.001 | -0.161 | 0.155 | 0.065 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.bias + | 0.000 | -0.526 | 0.442 | 0.119 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc11.weight + | -0.102 | -0.319 | 0.054 | 0.072 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.555 | 0.499 | 0.126 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc12.weight + | -0.003 | -0.201 | 0.135 | 0.065 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc12.bias + | 0.001 | -0.454 | 0.522 | 0.122 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.mlp.fc2.weight + | -0.011 | -0.379 | 0.195 | 0.091 | torch.Size([120]) || stage3.residual_group1.blocks.4.mlp.fc2.bias + | 0.856 | 0.618 | 1.073 | 0.095 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.weight + | -0.059 | -0.368 | 0.153 | 0.095 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.bias + | -0.006 | -1.747 | 1.724 | 0.133 | torch.Size([675, 6]) || stage3.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.399 | 0.417 | 0.090 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_self.weight + | 0.009 | -0.294 | 0.398 | 0.079 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_self.bias + | 0.001 | -0.345 | 0.341 | 0.067 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.attn.proj.weight + | -0.004 | -0.435 | 0.326 | 0.113 | torch.Size([120]) || stage3.residual_group1.blocks.5.attn.proj.bias + | -0.000 | -0.370 | 0.339 | 0.052 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.000 | -0.059 | 0.060 | 0.012 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.707 | 0.600 | 0.832 | 0.051 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.weight + | -0.001 | -0.157 | 0.140 | 0.063 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.bias + | 0.001 | -0.473 | 0.464 | 0.117 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc11.weight + | -0.091 | -0.291 | 0.092 | 0.073 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.479 | 0.477 | 0.124 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc12.weight + | 0.004 | -0.197 | 0.180 | 0.063 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc12.bias + | -0.001 | -0.504 | 0.440 | 0.118 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.mlp.fc2.weight + | -0.008 | -0.449 | 0.421 | 0.135 | torch.Size([120]) || stage3.residual_group1.blocks.5.mlp.fc2.bias + | 0.003 | -0.331 | 0.524 | 0.083 | torch.Size([120, 120]) || stage3.linear1.weight + | -0.001 | -0.270 | 0.250 | 0.116 | torch.Size([120]) || stage3.linear1.bias + | 0.883 | 0.354 | 1.107 | 0.120 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.weight + | 0.011 | -0.416 | 0.299 | 0.131 | 
torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.bias + | 0.000 | -0.322 | 0.139 | 0.028 | torch.Size([3375, 6]) || stage3.residual_group2.blocks.0.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage3.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.470 | 0.455 | 0.097 | torch.Size([360, 120]) || stage3.residual_group2.blocks.0.attn.qkv_self.weight + | 0.007 | -0.384 | 0.374 | 0.125 | torch.Size([360]) || stage3.residual_group2.blocks.0.attn.qkv_self.bias + | 0.000 | -0.467 | 0.428 | 0.109 | torch.Size([120, 120]) || stage3.residual_group2.blocks.0.attn.proj.weight + | -0.009 | -0.348 | 0.279 | 0.126 | torch.Size([120]) || stage3.residual_group2.blocks.0.attn.proj.bias + | 0.873 | 0.618 | 1.060 | 0.070 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.weight + | 0.005 | -0.242 | 0.278 | 0.098 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.549 | 0.437 | 0.115 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc11.weight + | -0.053 | -0.174 | 0.127 | 0.058 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc11.bias + | 0.000 | -0.469 | 0.517 | 0.124 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc12.weight + | -0.002 | -0.133 | 0.187 | 0.052 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.548 | 0.557 | 0.125 | torch.Size([120, 240]) || stage3.residual_group2.blocks.0.mlp.fc2.weight + | -0.011 | -0.339 | 0.303 | 0.116 | torch.Size([120]) || stage3.residual_group2.blocks.0.mlp.fc2.bias + | 0.960 | 0.744 | 1.153 | 0.095 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.weight + | 0.004 | -0.302 | 0.238 | 0.099 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.bias + | 0.000 | -0.567 | 0.133 | 0.032 | torch.Size([3375, 6]) || stage3.residual_group2.blocks.1.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage3.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.425 | 0.414 | 0.087 | torch.Size([360, 120]) || stage3.residual_group2.blocks.1.attn.qkv_self.weight + | 0.001 | -0.419 | 0.485 | 0.116 | torch.Size([360]) || stage3.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.429 | 0.385 | 0.095 | torch.Size([120, 120]) || stage3.residual_group2.blocks.1.attn.proj.weight + | -0.011 | -0.398 | 0.287 | 0.123 | torch.Size([120]) || stage3.residual_group2.blocks.1.attn.proj.bias + | 0.909 | 0.770 | 1.090 | 0.066 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.weight + | -0.000 | -0.204 | 0.175 | 0.073 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.451 | 0.462 | 0.115 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc11.weight + | -0.069 | -0.268 | 0.143 | 0.077 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.488 | 0.602 | 0.126 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc12.weight + | -0.004 | -0.179 | 0.114 | 0.050 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc12.bias + | 0.000 | -0.480 | 0.466 | 0.118 | torch.Size([120, 240]) || stage3.residual_group2.blocks.1.mlp.fc2.weight + | -0.007 | -0.358 | 0.225 | 0.102 | torch.Size([120]) || stage3.residual_group2.blocks.1.mlp.fc2.bias + | 0.003 | -0.274 | 0.457 | 0.073 | torch.Size([120, 120]) || stage3.linear2.weight + | 0.002 | -0.532 | 0.438 | 0.200 | torch.Size([120]) || stage3.linear2.bias + | -0.000 | -0.098 | 
0.115 | 0.025 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.weight + | 0.002 | -0.033 | 0.041 | 0.015 | torch.Size([120]) || stage3.pa_deform.bias + | -0.000 | -0.017 | 0.017 | 0.010 | torch.Size([120, 364, 3, 3]) || stage3.pa_deform.conv_offset.0.weight + | -0.010 | -0.030 | 0.017 | 0.010 | torch.Size([120]) || stage3.pa_deform.conv_offset.0.bias + | -0.000 | -0.078 | 0.069 | 0.020 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.2.weight + | -0.006 | -0.055 | 0.067 | 0.026 | torch.Size([120]) || stage3.pa_deform.conv_offset.2.bias + | -0.001 | -0.071 | 0.067 | 0.020 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.4.weight + | 0.004 | -0.070 | 0.113 | 0.042 | torch.Size([120]) || stage3.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432, 120, 3, 3]) || stage3.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432]) || stage3.pa_deform.conv_offset.6.bias + | 0.004 | -0.623 | 0.669 | 0.126 | torch.Size([360, 360]) || stage3.pa_fuse.fc11.weight + | 0.092 | -0.221 | 0.676 | 0.151 | torch.Size([360]) || stage3.pa_fuse.fc11.bias + | 0.000 | -0.604 | 0.689 | 0.125 | torch.Size([360, 360]) || stage3.pa_fuse.fc12.weight + | 0.008 | -0.544 | 0.379 | 0.118 | torch.Size([360]) || stage3.pa_fuse.fc12.bias + | 0.000 | -0.669 | 0.719 | 0.151 | torch.Size([120, 360]) || stage3.pa_fuse.fc2.weight + | -0.005 | -0.411 | 0.443 | 0.155 | torch.Size([120]) || stage3.pa_fuse.fc2.bias + | 1.005 | 0.488 | 1.503 | 0.166 | torch.Size([480]) || stage4.reshape.1.weight + | 0.001 | -0.316 | 0.358 | 0.118 | torch.Size([480]) || stage4.reshape.1.bias + | 0.000 | -0.486 | 0.450 | 0.084 | torch.Size([120, 480]) || stage4.reshape.2.weight + | -0.007 | -0.139 | 0.092 | 0.043 | torch.Size([120]) || stage4.reshape.2.bias + | 0.996 | 0.831 | 1.101 | 0.039 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.weight + | -0.014 | -0.109 | 0.112 | 0.040 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.bias + | 0.000 | -0.064 | 0.064 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.109 | 0.107 | 0.023 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_self.weight + | -0.001 | -0.033 | 0.029 | 0.009 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_self.bias + | -0.000 | -0.256 | 0.235 | 0.030 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.attn.proj.weight + | 0.007 | -0.099 | 0.227 | 0.051 | torch.Size([120]) || stage4.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.129 | 0.142 | 0.025 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.035 | 0.029 | 0.006 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.966 | 0.869 | 1.089 | 0.041 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.weight + | 0.000 | -0.155 | 0.152 | 0.058 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.248 | 0.221 | 0.024 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc11.weight + | -0.002 | -0.066 | 0.012 | 0.007 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.287 | 0.219 | 0.024 | torch.Size([240, 120]) || 
stage4.residual_group1.blocks.0.mlp.fc12.weight + | 0.000 | -0.085 | 0.067 | 0.010 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.256 | 0.235 | 0.025 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.mlp.fc2.weight + | 0.009 | -0.123 | 0.254 | 0.058 | torch.Size([120]) || stage4.residual_group1.blocks.0.mlp.fc2.bias + | 0.988 | 0.825 | 1.079 | 0.043 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.weight + | -0.013 | -0.123 | 0.105 | 0.047 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.bias + | -0.000 | -0.081 | 0.078 | 0.021 | torch.Size([675, 6]) || stage4.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.133 | 0.170 | 0.025 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_self.weight + | -0.000 | -0.053 | 0.048 | 0.014 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.177 | 0.174 | 0.031 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.attn.proj.weight + | 0.008 | -0.099 | 0.204 | 0.048 | torch.Size([120]) || stage4.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.138 | 0.130 | 0.026 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.000 | -0.061 | 0.059 | 0.010 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.996 | 0.943 | 1.081 | 0.026 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.weight + | 0.001 | -0.064 | 0.051 | 0.027 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.bias + | -0.000 | -0.336 | 0.268 | 0.024 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc11.weight + | 0.000 | -0.029 | 0.028 | 0.006 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.223 | 0.272 | 0.024 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc12.weight + | -0.001 | -0.084 | 0.037 | 0.009 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.207 | 0.216 | 0.024 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.mlp.fc2.weight + | 0.007 | -0.140 | 0.216 | 0.058 | torch.Size([120]) || stage4.residual_group1.blocks.1.mlp.fc2.bias + | 0.994 | 0.855 | 1.108 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.weight + | -0.019 | -0.115 | 0.091 | 0.028 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.bias + | 0.000 | -0.063 | 0.076 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.2.attn.position_bias + | -0.000 | -0.190 | 0.179 | 0.027 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_self.weight + | -0.001 | -0.043 | 0.039 | 0.011 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.158 | 0.161 | 0.030 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.attn.proj.weight + | 0.008 | -0.118 | 0.164 | 0.050 | torch.Size([120]) || stage4.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.213 | 0.211 | 0.029 | torch.Size([360, 120]) || 
stage4.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.000 | -0.043 | 0.040 | 0.010 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.993 | 0.903 | 1.099 | 0.028 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.weight + | 0.003 | -0.097 | 0.106 | 0.044 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.186 | 0.177 | 0.024 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc11.weight + | -0.000 | -0.068 | 0.045 | 0.010 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.307 | 0.185 | 0.024 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc12.weight + | -0.000 | -0.081 | 0.061 | 0.010 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.195 | 0.216 | 0.024 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.mlp.fc2.weight + | 0.008 | -0.115 | 0.161 | 0.050 | torch.Size([120]) || stage4.residual_group1.blocks.2.mlp.fc2.bias + | 0.997 | 0.893 | 1.071 | 0.032 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.weight + | -0.019 | -0.083 | 0.047 | 0.024 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.bias + | 0.001 | -0.076 | 0.073 | 0.021 | torch.Size([675, 6]) || stage4.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.275 | 0.259 | 0.029 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_self.weight + | -0.001 | -0.071 | 0.066 | 0.017 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.166 | 0.157 | 0.028 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.attn.proj.weight + | 0.008 | -0.105 | 0.149 | 0.043 | torch.Size([120]) || stage4.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.184 | 0.197 | 0.028 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.001 | -0.042 | 0.050 | 0.008 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.001 | 0.971 | 1.136 | 0.022 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.weight + | -0.002 | -0.054 | 0.050 | 0.023 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.329 | 0.210 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc11.weight + | -0.000 | -0.078 | 0.029 | 0.009 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.234 | 0.241 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc12.weight + | 0.000 | -0.031 | 0.024 | 0.006 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.169 | 0.164 | 0.023 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.mlp.fc2.weight + | 0.007 | -0.085 | 0.114 | 0.043 | torch.Size([120]) || stage4.residual_group1.blocks.3.mlp.fc2.bias + | 1.003 | 0.901 | 1.099 | 0.044 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.weight + | -0.034 | -0.095 | 0.039 | 0.030 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.bias + | 0.000 | -0.071 | 0.090 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || 
stage4.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.238 | 0.268 | 0.034 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_self.weight + | -0.002 | -0.199 | 0.144 | 0.030 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_self.bias + | -0.000 | -0.167 | 0.218 | 0.029 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.attn.proj.weight + | 0.008 | -0.089 | 0.140 | 0.039 | torch.Size([120]) || stage4.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.267 | 0.253 | 0.031 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.001 | -0.067 | 0.069 | 0.009 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.004 | 0.953 | 1.056 | 0.014 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.weight + | -0.001 | -0.056 | 0.077 | 0.021 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.170 | 0.184 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc11.weight + | 0.001 | -0.037 | 0.027 | 0.007 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.149 | 0.202 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc12.weight + | 0.000 | -0.059 | 0.095 | 0.010 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -0.145 | 0.181 | 0.023 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.mlp.fc2.weight + | 0.006 | -0.086 | 0.117 | 0.036 | torch.Size([120]) || stage4.residual_group1.blocks.4.mlp.fc2.bias + | 0.996 | 0.859 | 1.077 | 0.047 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.weight + | -0.058 | -0.153 | 0.009 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.bias + | 0.000 | -0.087 | 0.083 | 0.021 | torch.Size([675, 6]) || stage4.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.249 | 0.266 | 0.033 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_self.weight + | -0.001 | -0.199 | 0.168 | 0.031 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_self.bias + | 0.000 | -0.156 | 0.142 | 0.027 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.attn.proj.weight + | 0.004 | -0.102 | 0.145 | 0.045 | torch.Size([120]) || stage4.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.299 | 0.376 | 0.033 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.000 | -0.034 | 0.066 | 0.007 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.992 | 0.924 | 1.097 | 0.025 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.weight + | -0.002 | -0.089 | 0.074 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.192 | 0.208 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc11.weight + | -0.002 | -0.064 | 0.021 | 0.009 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.240 | 0.191 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc12.weight + | 0.000 | -0.040 | 0.044 | 0.008 | torch.Size([240]) || 
stage4.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -0.141 | 0.155 | 0.022 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.mlp.fc2.weight + | 0.005 | -0.107 | 0.103 | 0.045 | torch.Size([120]) || stage4.residual_group1.blocks.5.mlp.fc2.bias + | 0.001 | -0.286 | 0.303 | 0.059 | torch.Size([120, 120]) || stage4.linear1.weight + | -0.012 | -0.311 | 0.190 | 0.090 | torch.Size([120]) || stage4.linear1.bias + | 1.009 | 0.926 | 1.101 | 0.028 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.weight + | -0.001 | -0.036 | 0.048 | 0.015 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.bias + | 0.000 | -0.071 | 0.076 | 0.020 | torch.Size([3375, 6]) || stage4.residual_group2.blocks.0.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage4.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.135 | 0.141 | 0.023 | torch.Size([360, 120]) || stage4.residual_group2.blocks.0.attn.qkv_self.weight + | 0.001 | -0.023 | 0.021 | 0.007 | torch.Size([360]) || stage4.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.115 | 0.121 | 0.025 | torch.Size([120, 120]) || stage4.residual_group2.blocks.0.attn.proj.weight + | -0.007 | -0.200 | 0.098 | 0.043 | torch.Size([120]) || stage4.residual_group2.blocks.0.attn.proj.bias + | 1.002 | 0.999 | 1.016 | 0.002 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.weight + | 0.000 | -0.003 | 0.004 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.082 | 0.094 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc11.weight + | 0.000 | -0.005 | 0.017 | 0.002 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc11.bias + | 0.000 | -0.088 | 0.079 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc12.weight + | -0.000 | -0.010 | 0.008 | 0.002 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc12.bias + | -0.000 | -0.090 | 0.105 | 0.020 | torch.Size([120, 240]) || stage4.residual_group2.blocks.0.mlp.fc2.weight + | -0.006 | -0.181 | 0.096 | 0.041 | torch.Size([120]) || stage4.residual_group2.blocks.0.mlp.fc2.bias + | 1.006 | 0.923 | 1.098 | 0.025 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.weight + | -0.001 | -0.045 | 0.053 | 0.019 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.083 | 0.085 | 0.020 | torch.Size([3375, 6]) || stage4.residual_group2.blocks.1.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage4.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.132 | 0.133 | 0.023 | torch.Size([360, 120]) || stage4.residual_group2.blocks.1.attn.qkv_self.weight + | -0.000 | -0.030 | 0.035 | 0.009 | torch.Size([360]) || stage4.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.129 | 0.094 | 0.024 | torch.Size([120, 120]) || stage4.residual_group2.blocks.1.attn.proj.weight + | -0.008 | -0.218 | 0.116 | 0.048 | torch.Size([120]) || stage4.residual_group2.blocks.1.attn.proj.bias + | 1.003 | 0.999 | 1.024 | 0.003 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.weight + | -0.000 | -0.004 | 0.005 | 0.002 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.bias + | -0.000 | -0.126 | 0.080 | 0.021 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc11.weight + | 0.001 | -0.006 | 0.016 | 0.003 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.092 | 0.076 | 0.020 | 
torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc12.weight + | 0.000 | -0.015 | 0.013 | 0.003 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.091 | 0.115 | 0.020 | torch.Size([120, 240]) || stage4.residual_group2.blocks.1.mlp.fc2.weight + | -0.006 | -0.196 | 0.090 | 0.041 | torch.Size([120]) || stage4.residual_group2.blocks.1.mlp.fc2.bias + | 0.001 | -0.291 | 0.416 | 0.059 | torch.Size([120, 120]) || stage4.linear2.weight + | -0.009 | -0.269 | 0.198 | 0.094 | torch.Size([120]) || stage4.linear2.bias + | 0.000 | -0.053 | 0.057 | 0.019 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.weight + | -0.001 | -0.021 | 0.021 | 0.009 | torch.Size([120]) || stage4.pa_deform.bias + | -0.000 | -0.017 | 0.017 | 0.010 | torch.Size([120, 364, 3, 3]) || stage4.pa_deform.conv_offset.0.weight + | -0.000 | -0.015 | 0.015 | 0.009 | torch.Size([120]) || stage4.pa_deform.conv_offset.0.bias + | -0.000 | -0.039 | 0.041 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.2.weight + | 0.000 | -0.030 | 0.029 | 0.018 | torch.Size([120]) || stage4.pa_deform.conv_offset.2.bias + | -0.000 | -0.045 | 0.041 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.4.weight + | -0.002 | -0.031 | 0.030 | 0.016 | torch.Size([120]) || stage4.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432, 120, 3, 3]) || stage4.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432]) || stage4.pa_deform.conv_offset.6.bias + | -0.000 | -0.356 | 0.435 | 0.035 | torch.Size([360, 360]) || stage4.pa_fuse.fc11.weight + | 0.003 | -0.080 | 0.304 | 0.033 | torch.Size([360]) || stage4.pa_fuse.fc11.bias + | 0.000 | -0.361 | 0.436 | 0.035 | torch.Size([360, 360]) || stage4.pa_fuse.fc12.weight + | -0.001 | -0.166 | 0.299 | 0.032 | torch.Size([360]) || stage4.pa_fuse.fc12.bias + | -0.000 | -0.748 | 0.752 | 0.056 | torch.Size([120, 360]) || stage4.pa_fuse.fc2.weight + | -0.000 | -0.262 | 0.270 | 0.086 | torch.Size([120]) || stage4.pa_fuse.fc2.bias + | 0.980 | 0.710 | 1.274 | 0.146 | torch.Size([30]) || stage5.reshape.1.weight + | -0.002 | -0.062 | 0.057 | 0.036 | torch.Size([30]) || stage5.reshape.1.bias + | 0.001 | -0.530 | 0.432 | 0.092 | torch.Size([120, 30]) || stage5.reshape.2.weight + | 0.021 | -0.305 | 0.337 | 0.080 | torch.Size([120]) || stage5.reshape.2.bias + | 0.994 | 0.934 | 1.012 | 0.016 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.weight + | -0.014 | -0.040 | 0.038 | 0.014 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.bias + | 0.000 | -0.082 | 0.072 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.078 | 0.101 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_self.weight + | -0.000 | -0.022 | 0.023 | 0.005 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.198 | 0.237 | 0.022 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.attn.proj.weight + | -0.003 | -0.067 | 0.082 | 0.027 | torch.Size([120]) || stage5.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.103 | 0.092 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.007 | 0.006 | 
0.002 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.991 | 0.929 | 1.004 | 0.011 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.weight
+ | 0.001 | -0.009 | 0.014 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.bias
+ | -0.000 | -0.112 | 0.093 | 0.021 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc11.weight
+ | -0.001 | -0.033 | 0.027 | 0.008 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.098 | 0.085 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc12.weight
+ | -0.000 | -0.033 | 0.026 | 0.009 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.163 | 0.140 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.003 | -0.060 | 0.110 | 0.032 | torch.Size([120]) || stage5.residual_group1.blocks.0.mlp.fc2.bias
+ | 0.992 | 0.872 | 1.010 | 0.018 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.weight
+ | -0.015 | -0.039 | 0.031 | 0.010 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.bias
+ | -0.000 | -0.078 | 0.078 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.1.attn.position_bias
+ | 0.000 | -0.088 | 0.099 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.030 | 0.030 | 0.006 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.151 | 0.185 | 0.022 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.attn.proj.weight
+ | -0.005 | -0.073 | 0.061 | 0.024 | torch.Size([120]) || stage5.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.093 | 0.089 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_mut.weight
+ | 0.000 | -0.009 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 0.997 | 0.923 | 1.003 | 0.008 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.weight
+ | 0.000 | -0.008 | 0.009 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.bias
+ | -0.000 | -0.082 | 0.092 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.000 | -0.023 | 0.021 | 0.007 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.082 | 0.078 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc12.weight
+ | -0.001 | -0.028 | 0.025 | 0.008 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.097 | 0.090 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.000 | -0.062 | 0.102 | 0.028 | torch.Size([120]) || stage5.residual_group1.blocks.1.mlp.fc2.bias
+ | 0.994 | 0.845 | 1.015 | 0.018 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.weight
+ | -0.018 | -0.045 | 0.016 | 0.008 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.bias
+ | 0.000 | -0.065 | 0.068 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.2.attn.position_bias
+ | -0.000 | -0.088 | 0.113 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_self.weight
+ | 0.000 | -0.022 | 0.020 | 0.005 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_self.bias
+ | -0.000 | -0.124 | 0.124 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.attn.proj.weight
+ | -0.001 | -0.061 | 0.049 | 0.020 | torch.Size([120]) || stage5.residual_group1.blocks.2.attn.proj.bias
+ | -0.000 | -0.088 | 0.087 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_mut.weight
+ | -0.000 | -0.008 | 0.005 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 0.993 | 0.847 | 1.012 | 0.016 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.weight
+ | 0.000 | -0.014 | 0.015 | 0.007 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.bias
+ | 0.000 | -0.096 | 0.096 | 0.021 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc11.weight
+ | 0.001 | -0.038 | 0.027 | 0.009 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc11.bias
+ | -0.000 | -0.090 | 0.095 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc12.weight
+ | 0.000 | -0.045 | 0.039 | 0.011 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc12.bias
+ | -0.000 | -0.153 | 0.130 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.mlp.fc2.weight
+ | -0.006 | -0.097 | 0.083 | 0.028 | torch.Size([120]) || stage5.residual_group1.blocks.2.mlp.fc2.bias
+ | 0.984 | 0.798 | 1.006 | 0.023 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.weight
+ | -0.018 | -0.042 | 0.003 | 0.010 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.bias
+ | 0.000 | -0.074 | 0.214 | 0.021 | torch.Size([675, 6]) || stage5.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.133 | 0.132 | 0.022 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_self.weight
+ | -0.000 | -0.035 | 0.037 | 0.008 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_self.bias
+ | -0.000 | -0.121 | 0.123 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.attn.proj.weight
+ | -0.002 | -0.043 | 0.049 | 0.016 | torch.Size([120]) || stage5.residual_group1.blocks.3.attn.proj.bias
+ | 0.000 | -0.082 | 0.093 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_mut.weight
+ | -0.000 | -0.007 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 0.993 | 0.809 | 1.008 | 0.018 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.weight
+ | 0.001 | -0.018 | 0.013 | 0.006 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.bias
+ | -0.000 | -0.100 | 0.097 | 0.021 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc11.weight
+ | 0.001 | -0.038 | 0.045 | 0.009 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc11.bias
+ | -0.000 | -0.104 | 0.095 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.000 | -0.043 | 0.040 | 0.011 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.108 | 0.121 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.002 | -0.066 | 0.048 | 0.023 | torch.Size([120]) || stage5.residual_group1.blocks.3.mlp.fc2.bias
+ | 0.988 | 0.835 | 1.035 | 0.019 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.weight
+ | -0.022 | -0.052 | 0.003 | 0.013 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.bias
+ | -0.000 | -0.086 | 0.118 | 0.021 | torch.Size([675, 6]) || stage5.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.4.attn.position_bias
+ | 0.000 | -0.199 | 0.223 | 0.023 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.000 | -0.045 | 0.028 | 0.009 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_self.bias
+ | 0.000 | -0.114 | 0.143 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.attn.proj.weight
+ | -0.003 | -0.060 | 0.047 | 0.021 | torch.Size([120]) || stage5.residual_group1.blocks.4.attn.proj.bias
+ | -0.000 | -0.117 | 0.102 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_mut.weight
+ | 0.000 | -0.008 | 0.010 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.994 | 0.774 | 1.007 | 0.021 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.weight
+ | 0.001 | -0.023 | 0.027 | 0.010 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.bias
+ | -0.000 | -0.085 | 0.107 | 0.021 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc11.weight
+ | 0.003 | -0.044 | 0.042 | 0.013 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc11.bias
+ | -0.000 | -0.103 | 0.080 | 0.021 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.000 | -0.067 | 0.058 | 0.015 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc12.bias
+ | 0.000 | -0.096 | 0.103 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.000 | -0.045 | 0.054 | 0.023 | torch.Size([120]) || stage5.residual_group1.blocks.4.mlp.fc2.bias
+ | 0.985 | 0.552 | 1.092 | 0.044 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.weight
+ | -0.023 | -0.073 | 0.024 | 0.019 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.bias
+ | -0.000 | -0.080 | 0.121 | 0.021 | torch.Size([675, 6]) || stage5.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -1.776 | 0.186 | 0.026 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_self.weight
+ | -0.000 | -0.070 | 0.065 | 0.015 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_self.bias
+ | 0.000 | -0.230 | 0.359 | 0.022 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.attn.proj.weight
+ | -0.001 | -0.062 | 0.079 | 0.028 | torch.Size([120]) || stage5.residual_group1.blocks.5.attn.proj.bias
+ | -0.000 | -0.086 | 0.104 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_mut.weight
+ | -0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.976 | 0.863 | 0.995 | 0.015 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.weight
+ | -0.001 | -0.037 | 0.053 | 0.018 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.bias
+ | -0.000 | -0.121 | 0.100 | 0.021 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc11.weight
+ | 0.009 | -0.074 | 0.101 | 0.021 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc11.bias
+ | 0.000 | -0.102 | 0.101 | 0.021 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.001 | -0.092 | 0.082 | 0.028 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc12.bias
+ | -0.000 | -0.148 | 0.202 | 0.022 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.mlp.fc2.weight
+ | 0.001 | -0.056 | 0.054 | 0.025 | torch.Size([120]) || stage5.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.139 | 0.123 | 0.024 | torch.Size([120, 120]) || stage5.linear1.weight
+ | 0.022 | -0.317 | 0.336 | 0.081 | torch.Size([120]) || stage5.linear1.bias
+ | 0.963 | 0.765 | 1.026 | 0.058 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.weight
+ | -0.001 | -0.315 | 0.286 | 0.078 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.bias
+ | 0.000 | -0.077 | 0.080 | 0.020 | torch.Size([3375, 6]) || stage5.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage5.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.159 | 0.119 | 0.022 | torch.Size([360, 120]) || stage5.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.000 | -0.038 | 0.044 | 0.013 | torch.Size([360]) || stage5.residual_group2.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.134 | 0.126 | 0.024 | torch.Size([120, 120]) || stage5.residual_group2.blocks.0.attn.proj.weight
+ | -0.005 | -0.263 | 0.230 | 0.060 | torch.Size([120]) || stage5.residual_group2.blocks.0.attn.proj.bias
+ | 0.990 | 0.913 | 1.001 | 0.017 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.weight
+ | 0.000 | -0.009 | 0.010 | 0.004 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.bias
+ | -0.000 | -0.077 | 0.089 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.004 | -0.025 | 0.016 | 0.007 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.073 | 0.090 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.000 | -0.018 | 0.018 | 0.007 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.084 | 0.083 | 0.020 | torch.Size([120, 240]) || stage5.residual_group2.blocks.0.mlp.fc2.weight
+ | -0.006 | -0.264 | 0.273 | 0.056 | torch.Size([120]) || stage5.residual_group2.blocks.0.mlp.fc2.bias
+ | 0.976 | 0.733 | 1.048 | 0.053 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.weight
+ | -0.001 | -0.265 | 0.241 | 0.061 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.bias
+ | -0.000 | -0.079 | 0.081 | 0.020 | torch.Size([3375, 6]) || stage5.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage5.residual_group2.blocks.1.attn.relative_position_index
+ | -0.000 | -0.145 | 0.145 | 0.023 | torch.Size([360, 120]) || stage5.residual_group2.blocks.1.attn.qkv_self.weight
+ | -0.000 | -0.031 | 0.051 | 0.009 | torch.Size([360]) || stage5.residual_group2.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.114 | 0.103 | 0.025 | torch.Size([120, 120]) || stage5.residual_group2.blocks.1.attn.proj.weight
+ | -0.011 | -0.166 | 0.119 | 0.032 | torch.Size([120]) || stage5.residual_group2.blocks.1.attn.proj.bias
+ | 0.993 | 0.939 | 1.001 | 0.012 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.weight
+ | 0.000 | -0.011 | 0.008 | 0.004 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.bias
+ | -0.000 | -0.090 | 0.081 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.002 | -0.026 | 0.020 | 0.007 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.092 | 0.078 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc12.weight
+ | 0.000 | -0.020 | 0.021 | 0.007 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.097 | 0.093 | 0.020 | torch.Size([120, 240]) || stage5.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.016 | -0.224 | 0.158 | 0.041 | torch.Size([120]) || stage5.residual_group2.blocks.1.mlp.fc2.bias
+ | -0.000 | -0.244 | 0.248 | 0.044 | torch.Size([120, 120]) || stage5.linear2.weight
+ | 0.022 | -0.367 | 0.377 | 0.103 | torch.Size([120]) || stage5.linear2.bias
+ | -0.000 | -0.153 | 0.112 | 0.022 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.weight
+ | -0.004 | -0.061 | 0.053 | 0.023 | torch.Size([120]) || stage5.pa_deform.bias
+ | 0.000 | -0.017 | 0.017 | 0.010 | torch.Size([120, 364, 3, 3]) || stage5.pa_deform.conv_offset.0.weight
+ | -0.010 | -0.038 | 0.022 | 0.013 | torch.Size([120]) || stage5.pa_deform.conv_offset.0.bias
+ | -0.001 | -0.081 | 0.076 | 0.020 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.2.weight
+ | -0.008 | -0.062 | 0.031 | 0.021 | torch.Size([120]) || stage5.pa_deform.conv_offset.2.bias
+ | -0.000 | -0.080 | 0.079 | 0.019 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.4.weight
+ | -0.005 | -0.057 | 0.035 | 0.020 | torch.Size([120]) || stage5.pa_deform.conv_offset.4.bias
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432, 120, 3, 3]) || stage5.pa_deform.conv_offset.6.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432]) || stage5.pa_deform.conv_offset.6.bias
+ | 0.000 | -0.590 | 0.536 | 0.063 | torch.Size([360, 360]) || stage5.pa_fuse.fc11.weight
+ | 0.075 | -0.075 | 0.431 | 0.094 | torch.Size([360]) || stage5.pa_fuse.fc11.bias
+ | 0.000 | -0.704 | 0.718 | 0.064 | torch.Size([360, 360]) || stage5.pa_fuse.fc12.weight
+ | 0.005 | -0.308 | 0.337 | 0.073 | torch.Size([360]) || stage5.pa_fuse.fc12.bias
+ | 0.000 | -0.702 | 0.735 | 0.101 | torch.Size([120, 360]) || stage5.pa_fuse.fc2.weight
+ | -0.005 | -0.422 | 0.451 | 0.157 | torch.Size([120]) || stage5.pa_fuse.fc2.bias
+ | 1.444 | 1.141 | 1.615 | 0.121 | torch.Size([30]) || stage6.reshape.1.weight
+ | -0.003 | -0.150 | 0.115 | 0.074 | torch.Size([30]) || stage6.reshape.1.bias
+ | 0.001 | -0.848 | 0.822 | 0.232 | torch.Size([120, 30]) || stage6.reshape.2.weight
+ | 0.004 | -0.514 | 0.640 | 0.181 | torch.Size([120]) || stage6.reshape.2.bias
+ | 0.557 | 0.119 | 0.895 | 0.153 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.weight
+ | -0.070 | -0.374 | 0.181 | 0.100 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.bias
+ | 0.001 | -0.438 | 0.141 | 0.054 | torch.Size([675, 6]) || stage6.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.0.attn.position_bias
+ | 0.000 | -0.339 | 0.306 | 0.051 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.005 | -0.318 | 0.257 | 0.059 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.473 | 0.491 | 0.061 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.attn.proj.weight
+ | -0.001 | -0.330 | 0.253 | 0.125 | torch.Size([120]) || stage6.residual_group1.blocks.0.attn.proj.bias
+ | 0.000 | -0.361 | 0.307 | 0.045 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_mut.weight
+ | -0.000 | -0.044 | 0.053 | 0.010 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.521 | 0.121 | 0.882 | 0.143 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.weight
+ | 0.003 | -0.212 | 0.271 | 0.104 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.bias
+ | -0.000 | -0.360 | 0.360 | 0.075 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc11.weight
+ | -0.095 | -0.280 | 0.021 | 0.059 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.354 | 0.331 | 0.069 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc12.weight
+ | -0.005 | -0.196 | 0.129 | 0.048 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc12.bias
+ | 0.001 | -0.486 | 0.379 | 0.080 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.001 | -0.154 | 0.154 | 0.069 | torch.Size([120]) || stage6.residual_group1.blocks.0.mlp.fc2.bias
+ | 0.587 | 0.200 | 0.865 | 0.122 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.weight
+ | -0.118 | -0.374 | 0.082 | 0.089 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.bias
+ | 0.001 | -0.423 | 0.140 | 0.050 | torch.Size([675, 6]) || stage6.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.1.attn.position_bias
+ | 0.000 | -0.315 | 0.354 | 0.057 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.001 | -0.184 | 0.148 | 0.047 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.626 | 0.422 | 0.060 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.attn.proj.weight
+ | 0.004 | -0.234 | 0.187 | 0.087 | torch.Size([120]) || stage6.residual_group1.blocks.1.attn.proj.bias
+ | -0.000 | -0.692 | 0.743 | 0.058 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_mut.weight
+ | -0.000 | -0.038 | 0.041 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 0.590 | 0.287 | 0.942 | 0.125 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.weight
+ | -0.006 | -0.196 | 0.203 | 0.076 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.bias
+ | 0.000 | -0.427 | 0.431 | 0.075 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.080 | -0.242 | 0.033 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.293 | 0.362 | 0.069 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc12.weight
+ | 0.001 | -0.171 | 0.207 | 0.047 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.423 | 0.467 | 0.077 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.000 | -0.152 | 0.184 | 0.057 | torch.Size([120]) || stage6.residual_group1.blocks.1.mlp.fc2.bias
+ | 0.703 | 0.255 | 1.008 | 0.132 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.weight
+ | -0.125 | -0.342 | 0.042 | 0.078 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.bias
+ | 0.000 | -0.381 | 0.350 | 0.052 | torch.Size([675, 6]) || stage6.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.2.attn.position_bias
+ | 0.000 | -0.426 | 0.500 | 0.058 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_self.weight
+ | -0.003 | -0.262 | 0.226 | 0.054 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_self.bias
+ | -0.001 | -0.299 | 0.325 | 0.055 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.attn.proj.weight
+ | -0.001 | -0.149 | 0.096 | 0.061 | torch.Size([120]) || stage6.residual_group1.blocks.2.attn.proj.bias
+ | 0.000 | -0.406 | 0.391 | 0.055 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_mut.weight
+ | 0.001 | -0.055 | 0.085 | 0.015 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 0.666 | 0.308 | 0.942 | 0.118 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.weight
+ | -0.005 | -0.203 | 0.265 | 0.086 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.bias
+ | -0.000 | -0.349 | 0.494 | 0.072 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.071 | -0.213 | 0.071 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc11.bias
+ | 0.000 | -0.294 | 0.408 | 0.066 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc12.weight
+ | -0.003 | -0.120 | 0.147 | 0.049 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc12.bias
+ | -0.000 | -0.303 | 0.304 | 0.073 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.mlp.fc2.weight
+ | -0.005 | -0.150 | 0.129 | 0.063 | torch.Size([120]) || stage6.residual_group1.blocks.2.mlp.fc2.bias
+ | 0.702 | 0.307 | 0.960 | 0.129 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.weight
+ | -0.100 | -0.262 | 0.057 | 0.070 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.bias
+ | 0.001 | -0.501 | 0.290 | 0.062 | torch.Size([675, 6]) || stage6.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.3.attn.position_bias
+ | -0.000 | -0.349 | 0.336 | 0.061 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_self.weight
+ | 0.001 | -0.287 | 0.202 | 0.053 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_self.bias
+ | 0.000 | -0.322 | 0.401 | 0.056 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.attn.proj.weight
+ | -0.004 | -0.182 | 0.151 | 0.062 | torch.Size([120]) || stage6.residual_group1.blocks.3.attn.proj.bias
+ | 0.000 | -0.441 | 0.444 | 0.054 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_mut.weight
+ | 0.000 | -0.038 | 0.033 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 0.666 | 0.317 | 0.970 | 0.117 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.weight
+ | -0.003 | -0.173 | 0.168 | 0.067 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.bias
+ | -0.000 | -0.354 | 0.408 | 0.070 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.072 | -0.297 | 0.067 | 0.065 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc11.bias
+ | 0.000 | -0.299 | 0.335 | 0.066 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.004 | -0.191 | 0.136 | 0.060 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc12.bias
+ | -0.000 | -0.400 | 0.590 | 0.071 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.mlp.fc2.weight
+ | -0.005 | -0.159 | 0.142 | 0.061 | torch.Size([120]) || stage6.residual_group1.blocks.3.mlp.fc2.bias
+ | 0.730 | 0.334 | 0.963 | 0.118 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.weight
+ | -0.064 | -0.201 | 0.064 | 0.055 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.bias
+ | -0.000 | -0.702 | 1.180 | 0.086 | torch.Size([675, 6]) || stage6.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.483 | 0.398 | 0.073 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_self.weight
+ | 0.004 | -0.480 | 0.514 | 0.080 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_self.bias
+ | 0.000 | -0.331 | 0.390 | 0.056 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.attn.proj.weight
+ | -0.004 | -0.141 | 0.167 | 0.050 | torch.Size([120]) || stage6.residual_group1.blocks.4.attn.proj.bias
+ | 0.000 | -0.387 | 0.470 | 0.048 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_mut.weight
+ | 0.001 | -0.065 | 0.039 | 0.010 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.656 | 0.235 | 0.874 | 0.105 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.weight
+ | -0.005 | -0.237 | 0.171 | 0.074 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.bias
+ | -0.000 | -0.440 | 0.483 | 0.075 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc11.weight
+ | -0.076 | -0.347 | 0.110 | 0.076 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc11.bias
+ | 0.000 | -0.286 | 0.348 | 0.070 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.001 | -0.189 | 0.169 | 0.069 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc12.bias
+ | 0.000 | -0.398 | 0.336 | 0.075 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.004 | -0.127 | 0.137 | 0.052 | torch.Size([120]) || stage6.residual_group1.blocks.4.mlp.fc2.bias
+ | 0.691 | 0.178 | 0.975 | 0.116 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.weight
+ | -0.042 | -0.137 | 0.099 | 0.037 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.bias
+ | -0.001 | -0.662 | 1.078 | 0.078 | torch.Size([675, 6]) || stage6.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -0.359 | 0.531 | 0.072 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_self.weight
+ | 0.002 | -0.293 | 0.311 | 0.075 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_self.bias
+ | 0.000 | -0.426 | 0.488 | 0.055 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.attn.proj.weight
+ | -0.006 | -0.103 | 0.159 | 0.044 | torch.Size([120]) || stage6.residual_group1.blocks.5.attn.proj.bias
+ | 0.000 | -0.401 | 0.385 | 0.044 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_mut.weight
+ | 0.001 | -0.039 | 0.043 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.607 | 0.210 | 0.802 | 0.094 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.weight
+ | -0.004 | -0.178 | 0.199 | 0.068 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.bias
+ | -0.000 | -0.377 | 0.541 | 0.079 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc11.weight
+ | -0.069 | -0.429 | 0.280 | 0.096 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc11.bias
+ | -0.000 | -0.394 | 0.344 | 0.077 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.000 | -0.241 | 0.223 | 0.085 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc12.bias
+ | -0.000 | -0.527 | 0.647 | 0.077 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.mlp.fc2.weight
+ | -0.006 | -0.126 | 0.157 | 0.047 | torch.Size([120]) || stage6.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.001 | -0.294 | 0.287 | 0.060 | torch.Size([120, 120]) || stage6.linear1.weight
+ | 0.006 | -0.543 | 0.664 | 0.193 | torch.Size([120]) || stage6.linear1.bias
+ | 0.674 | 0.222 | 1.065 | 0.154 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.weight
+ | 0.002 | -0.480 | 0.311 | 0.128 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.bias
+ | 0.000 | -0.629 | 0.461 | 0.041 | torch.Size([3375, 6]) || stage6.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage6.residual_group2.blocks.0.attn.relative_position_index
+ | 0.000 | -0.495 | 0.440 | 0.085 | torch.Size([360, 120]) || stage6.residual_group2.blocks.0.attn.qkv_self.weight
+ | -0.001 | -0.516 | 0.468 | 0.114 | torch.Size([360]) || stage6.residual_group2.blocks.0.attn.qkv_self.bias
+ | 0.001 | -0.369 | 0.377 | 0.085 | torch.Size([120, 120]) || stage6.residual_group2.blocks.0.attn.proj.weight
+ | -0.003 | -0.297 | 0.292 | 0.113 | torch.Size([120]) || stage6.residual_group2.blocks.0.attn.proj.bias
+ | 0.644 | 0.181 | 1.104 | 0.153 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.weight
+ | 0.003 | -0.167 | 0.185 | 0.070 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.bias
+ | -0.000 | -0.383 | 0.534 | 0.087 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.101 | -0.214 | 0.048 | 0.051 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.350 | 0.560 | 0.085 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.005 | -0.159 | 0.138 | 0.047 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc12.bias
+ | -0.001 | -0.374 | 0.488 | 0.091 | torch.Size([120, 240]) || stage6.residual_group2.blocks.0.mlp.fc2.weight
+ | -0.006 | -0.271 | 0.252 | 0.096 | torch.Size([120]) || stage6.residual_group2.blocks.0.mlp.fc2.bias
+ | 0.663 | 0.353 | 0.959 | 0.106 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.weight
+ | 0.001 | -0.314 | 0.289 | 0.089 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.bias
+ | 0.000 | -0.772 | 0.763 | 0.041 | torch.Size([3375, 6]) || stage6.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage6.residual_group2.blocks.1.attn.relative_position_index
+ | -0.000 | -0.495 | 0.604 | 0.086 | torch.Size([360, 120]) || stage6.residual_group2.blocks.1.attn.qkv_self.weight
+ | 0.005 | -0.491 | 0.401 | 0.097 | torch.Size([360]) || stage6.residual_group2.blocks.1.attn.qkv_self.bias
+ | 0.001 | -0.380 | 0.376 | 0.076 | torch.Size([120, 120]) || stage6.residual_group2.blocks.1.attn.proj.weight
+ | -0.007 | -0.321 | 0.234 | 0.096 | torch.Size([120]) || stage6.residual_group2.blocks.1.attn.proj.bias
+ | 0.666 | 0.226 | 1.153 | 0.138 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.weight
+ | 0.001 | -0.178 | 0.220 | 0.069 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.bias
+ | 0.000 | -0.514 | 0.608 | 0.090 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.132 | -0.313 | 0.023 | 0.059 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.423 | 0.488 | 0.088 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc12.weight
+ | -0.002 | -0.153 | 0.122 | 0.053 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.399 | 0.435 | 0.087 | torch.Size([120, 240]) || stage6.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.001 | -0.285 | 0.241 | 0.093 | torch.Size([120]) || stage6.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.000 | -0.308 | 0.365 | 0.070 | torch.Size([120, 120]) || stage6.linear2.weight
+ | -0.002 | -0.699 | 0.757 | 0.303 | torch.Size([120]) || stage6.linear2.bias
+ | 0.000 | -0.130 | 0.129 | 0.027 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.weight
+ | -0.001 | -0.051 | 0.045 | 0.018 | torch.Size([120]) || stage6.pa_deform.bias
+ | -0.000 | -0.017 | 0.017 | 0.010 | torch.Size([120, 364, 3, 3]) || stage6.pa_deform.conv_offset.0.weight
+ | -0.007 | -0.049 | 0.026 | 0.012 | torch.Size([120]) || stage6.pa_deform.conv_offset.0.bias
+ | -0.001 | -0.090 | 0.114 | 0.020 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.2.weight
+ | -0.008 | -0.070 | 0.060 | 0.030 | torch.Size([120]) || stage6.pa_deform.conv_offset.2.bias
+ | -0.001 | -0.097 | 0.101 | 0.020 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.4.weight
+ | 0.006 | -0.096 | 0.114 | 0.044 | torch.Size([120]) || stage6.pa_deform.conv_offset.4.bias
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432, 120, 3, 3]) || stage6.pa_deform.conv_offset.6.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432]) || stage6.pa_deform.conv_offset.6.bias
+ | -0.002 | -0.822 | 0.740 | 0.127 | torch.Size([360, 360]) || stage6.pa_fuse.fc11.weight
+ | 0.212 | -0.394 | 0.913 | 0.216 | torch.Size([360]) || stage6.pa_fuse.fc11.bias
+ | -0.000 | -0.948 | 0.848 | 0.131 | torch.Size([360, 360]) || stage6.pa_fuse.fc12.weight
+ | 0.001 | -0.657 | 0.605 | 0.279 | torch.Size([360]) || stage6.pa_fuse.fc12.bias
+ | -0.000 | -0.678 | 0.823 | 0.158 | torch.Size([120, 360]) || stage6.pa_fuse.fc2.weight
+ | 0.009 | -0.616 | 0.477 | 0.283 | torch.Size([120]) || stage6.pa_fuse.fc2.bias
+ | 1.363 | 1.278 | 1.458 | 0.048 | torch.Size([30]) || stage7.reshape.1.weight
+ | -0.001 | -0.247 | 0.227 | 0.139 | torch.Size([30]) || stage7.reshape.1.bias
+ | -0.000 | -0.590 | 0.587 | 0.179 | torch.Size([120, 30]) || stage7.reshape.2.weight
+ | -0.029 | -0.525 | 0.546 | 0.231 | torch.Size([120]) || stage7.reshape.2.bias
+ | 0.406 | 0.101 | 0.864 | 0.138 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.weight
+ | -0.159 | -0.667 | 0.525 | 0.161 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.bias
+ | -0.174 | -2.385 | 4.798 | 0.381 | torch.Size([675, 6]) || stage7.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.0.attn.position_bias
+ | -0.000 | -0.809 | 0.687 | 0.111 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_self.weight
+ | 0.001 | -0.275 | 0.262 | 0.057 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.416 | 0.438 | 0.096 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.attn.proj.weight
+ | 0.008 | -0.499 | 0.295 | 0.131 | torch.Size([120]) || stage7.residual_group1.blocks.0.attn.proj.bias
+ | -0.000 | -1.494 | 1.378 | 0.106 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_mut.weight
+ | -0.000 | -0.123 | 0.106 | 0.015 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.284 | 0.172 | 0.377 | 0.040 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.weight
+ | -0.003 | -0.502 | 0.588 | 0.124 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.bias
+ | 0.000 | -0.597 | 0.567 | 0.132 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc11.weight
+ | -0.061 | -0.420 | 0.409 | 0.104 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.606 | 0.601 | 0.144 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc12.weight
+ | -0.003 | -0.306 | 0.261 | 0.101 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.001 | -0.572 | 0.609 | 0.149 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.mlp.fc2.weight
+ | -0.008 | -0.373 | 0.306 | 0.099 | torch.Size([120]) || stage7.residual_group1.blocks.0.mlp.fc2.bias
+ | 0.538 | 0.114 | 0.809 | 0.125 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.weight
+ | -0.129 | -0.865 | 0.532 | 0.163 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.bias
+ | -0.281 | -2.710 | 4.413 | 0.432 | torch.Size([675, 6]) || stage7.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.1.attn.position_bias
+ | 0.000 | -0.646 | 0.655 | 0.135 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_self.weight
+ | -0.000 | -0.301 | 0.303 | 0.068 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.479 | 0.463 | 0.100 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.attn.proj.weight
+ | 0.016 | -0.460 | 0.313 | 0.135 | torch.Size([120]) || stage7.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -2.205 | 2.065 | 0.127 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_mut.weight
+ | -0.000 | -0.074 | 0.085 | 0.017 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 0.353 | 0.243 | 0.425 | 0.034 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.weight
+ | -0.008 | -0.643 | 0.628 | 0.146 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.bias
+ | 0.000 | -0.535 | 0.617 | 0.135 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.054 | -0.348 | 0.244 | 0.109 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc11.bias
+ | -0.001 | -0.671 | 0.611 | 0.148 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc12.weight
+ | 0.004 | -0.272 | 0.292 | 0.098 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.672 | 0.595 | 0.149 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.mlp.fc2.weight
+ | -0.003 | -0.398 | 0.273 | 0.088 | torch.Size([120]) || stage7.residual_group1.blocks.1.mlp.fc2.bias
+ | 0.581 | 0.093 | 0.791 | 0.147 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.weight
+ | -0.143 | -1.023 | 0.481 | 0.167 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.bias
+ | -0.098 | -2.171 | 4.402 | 0.287 | torch.Size([675, 6]) || stage7.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.2.attn.position_bias
+ | 0.000 | -0.640 | 0.701 | 0.147 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_self.weight
+ | -0.005 | -0.328 | 0.408 | 0.072 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_self.bias
+ | -0.001 | -0.417 | 0.441 | 0.101 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.attn.proj.weight
+ | 0.007 | -0.508 | 0.265 | 0.127 | torch.Size([120]) || stage7.residual_group1.blocks.2.attn.proj.bias
+ | -0.001 | -2.511 | 2.484 | 0.143 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_mut.weight
+ | -0.000 | -0.093 | 0.104 | 0.019 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 0.392 | 0.276 | 0.487 | 0.034 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.weight
+ | -0.016 | -0.555 | 0.581 | 0.143 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.bias
+ | -0.000 | -0.630 | 0.674 | 0.135 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.072 | -0.420 | 0.173 | 0.115 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc11.bias
+ | -0.000 | -0.654 | 0.793 | 0.152 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc12.weight
+ | -0.003 | -0.303 | 0.263 | 0.098 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc12.bias
+ | 0.000 | -0.603 | 0.658 | 0.150 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.mlp.fc2.weight
+ | 0.003 | -0.301 | 0.247 | 0.081 | torch.Size([120]) || stage7.residual_group1.blocks.2.mlp.fc2.bias
+ | 0.611 | 0.127 | 0.811 | 0.134 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.weight
+ | -0.137 | -0.781 | 0.684 | 0.164 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.bias
+ | -0.109 | -4.577 | 4.527 | 0.332 | torch.Size([675, 6]) || stage7.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.757 | 0.743 | 0.146 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_self.weight
+ | 0.001 | -0.358 | 0.342 | 0.083 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_self.bias
+ | 0.001 | -0.465 | 0.447 | 0.097 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.attn.proj.weight
+ | 0.002 | -0.389 | 0.233 | 0.113 | torch.Size([120]) || stage7.residual_group1.blocks.3.attn.proj.bias
+ | -0.001 | -1.947 | 1.928 | 0.127 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_mut.weight
+ | 0.000 | -0.106 | 0.070 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 0.410 | 0.283 | 0.489 | 0.035 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.weight
+ | -0.014 | -0.442 | 0.639 | 0.147 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.bias
+ | -0.000 | -0.542 | 0.585 | 0.132 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.069 | -0.463 | 0.214 | 0.122 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc11.bias
+ | 0.000 | -0.689 | 0.605 | 0.154 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.008 | -0.307 | 0.279 | 0.096 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc12.bias
+ | -0.000 | -0.593 | 0.603 | 0.152 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.010 | -0.269 | 0.270 | 0.094 | torch.Size([120]) || stage7.residual_group1.blocks.3.mlp.fc2.bias
+ | 0.652 | 0.132 | 0.859 | 0.133 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.weight
+ | -0.131 | -0.662 | 0.729 | 0.163 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.bias
+ | -0.092 | -4.521 | 3.027 | 0.337 | torch.Size([675, 6]) || stage7.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.694 | 0.828 | 0.148 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_self.weight
+ | 0.002 | -0.328 | 0.361 | 0.078 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_self.bias
+ | 0.000 | -0.430 | 0.483 | 0.100 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.attn.proj.weight
+ | -0.003 | -0.368 | 0.250 | 0.103 | torch.Size([120]) || stage7.residual_group1.blocks.4.attn.proj.bias
+ | -0.000 | -1.506 | 1.779 | 0.122 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_mut.weight
+ | 0.000 | -0.090 | 0.112 | 0.020 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.435 | 0.347 | 0.536 | 0.033 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.weight
+ | -0.018 | -0.345 | 0.609 | 0.136 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.bias
+ | -0.001 | -0.580 | 0.558 | 0.132 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc11.weight
+ | -0.066 | -0.392 | 0.239 | 0.128 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc11.bias
+ | -0.000 | -0.608 | 0.667 | 0.157 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc12.weight
+ | -0.001 | -0.276 | 0.296 | 0.105 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc12.bias
+ | 0.000 | -0.666 | 0.775 | 0.155 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.mlp.fc2.weight
+ | 0.001 | -0.380 | 0.360 | 0.101 | torch.Size([120]) || stage7.residual_group1.blocks.4.mlp.fc2.bias
+ | 0.648 | 0.269 | 0.885 | 0.109 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.weight
+ | -0.116 | -0.436 | 0.749 | 0.144 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.bias
+ | -0.130 | -3.976 | 4.665 | 0.318 | torch.Size([675, 6]) || stage7.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -0.702 | 0.671 | 0.140 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_self.weight
+ | 0.000 | -0.346 | 0.340 | 0.078 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_self.bias
+ | -0.000 | -0.410 | 0.394 | 0.091 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.attn.proj.weight
+ | 0.006 | -0.286 | 0.244 | 0.100 | torch.Size([120]) || stage7.residual_group1.blocks.5.attn.proj.bias
+ | 0.001 | -0.870 | 0.885 | 0.109 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_mut.weight
+ | 0.001 | -0.120 | 0.096 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.445 | 0.326 | 0.595 | 0.034 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.weight
+ | -0.016 | -0.233 | 0.558 | 0.110 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.bias
+ | -0.001 | -0.576 | 0.577 | 0.129 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc11.weight
+ | -0.038 | -0.525 | 0.269 | 0.139 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc11.bias
+ | -0.000 | -0.672 | 0.671 | 0.158 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.003 | -0.400 | 0.281 | 0.116 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc12.bias
+ | 0.000 | -0.937 | 0.714 | 0.156 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.mlp.fc2.weight
+ | 0.007 | -0.435 | 0.876 | 0.188 | torch.Size([120]) || stage7.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.234 | 0.212 | 0.056 | torch.Size([120, 120]) || stage7.linear1.weight
+ | -0.033 | -0.655 | 0.586 | 0.242 | torch.Size([120]) || stage7.linear1.bias
+ | 0.684 | 0.257 | 0.867 | 0.090 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.weight
+ | -0.003 | -0.857 | 0.829 | 0.193 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.bias
+ | -0.005 | -5.628 | 1.358 | 0.121 | torch.Size([3375, 6]) || stage7.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage7.residual_group2.blocks.0.attn.relative_position_index
+ | 0.000 | -0.699 | 0.827 | 0.137 | torch.Size([360, 120]) || stage7.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.001 | -0.821 | 0.662 | 0.143 | torch.Size([360]) || stage7.residual_group2.blocks.0.attn.qkv_self.bias
+ | 0.001 | -0.392 | 0.418 | 0.106 | torch.Size([120, 120]) || stage7.residual_group2.blocks.0.attn.proj.weight
+ | 0.003 | -0.147 | 0.171 | 0.052 | torch.Size([120]) || stage7.residual_group2.blocks.0.attn.proj.bias
+ | 0.431 | 0.316 | 0.521 | 0.036 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.weight
+ | -0.003 | -0.595 | 0.673 | 0.129 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.bias
+ | -0.000 | -0.701 | 0.542 | 0.119 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc11.weight
+ | 0.017 | -0.290 | 0.421 | 0.117 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.603 | 0.637 | 0.145 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.006 | -0.394 | 0.426 | 0.098 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.602 | 0.607 | 0.144 | torch.Size([120, 240]) || stage7.residual_group2.blocks.0.mlp.fc2.weight
+ | -0.003 | -0.460 | 0.272 | 0.112 | torch.Size([120]) || stage7.residual_group2.blocks.0.mlp.fc2.bias
+ | 0.655 | 0.251 | 0.779 | 0.074 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.weight
+ | -0.004 | -0.718 | 0.811 | 0.153 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.bias
+ | -0.007 | -3.104 | 1.224 | 0.101 | torch.Size([3375, 6]) || stage7.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage7.residual_group2.blocks.1.attn.relative_position_index
+ | -0.000 | -0.664 | 0.647 | 0.137 | torch.Size([360, 120]) || stage7.residual_group2.blocks.1.attn.qkv_self.weight
+ | 0.002 | -0.532 | 0.746 | 0.150 | torch.Size([360]) || stage7.residual_group2.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.428 | 0.360 | 0.100 | torch.Size([120, 120]) || stage7.residual_group2.blocks.1.attn.proj.weight
+ | 0.009 | -0.244 | 0.242 | 0.063 | torch.Size([120]) || stage7.residual_group2.blocks.1.attn.proj.bias
+ | 0.442 | 0.284 | 0.530 | 0.038 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.weight
+ | -0.004 | -0.421 | 0.664 | 0.106 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.bias
+ | -0.001 | -0.604 | 0.583 | 0.119 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc11.weight
+ | 0.028 | -0.389 | 0.406 | 0.134 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc11.bias
+ | -0.001 | -0.681 | 0.818 | 0.148 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc12.weight
+ | 0.003 | -0.247 | 0.361 | 0.096 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.783 | 0.835 | 0.146 | torch.Size([120, 240]) || stage7.residual_group2.blocks.1.mlp.fc2.weight
+ | 0.008 | -0.529 | 0.922 | 0.144 | torch.Size([120]) || stage7.residual_group2.blocks.1.mlp.fc2.bias
+ | -0.001 | -0.353 | 0.277 | 0.071 | torch.Size([120, 120]) || stage7.linear2.weight
+ | -0.026 | -0.905 | 0.749 | 0.262 | torch.Size([120]) || stage7.linear2.bias
+ | -0.000 | -0.125 | 0.138 | 0.027 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.weight
+ | -0.003 | -0.091 | 0.071 | 0.030 | torch.Size([120]) || stage7.pa_deform.bias
+ | 0.000 | -0.017 | 0.017 | 0.010 | torch.Size([120, 364, 3, 3]) || stage7.pa_deform.conv_offset.0.weight
+ | -0.000 | -0.028 | 0.054 | 0.015 | torch.Size([120]) || stage7.pa_deform.conv_offset.0.bias
+ | -0.001 | -0.130 | 0.111 | 0.017 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.2.weight
+ | -0.004 | -0.105 | 0.094 | 0.040 | torch.Size([120]) || stage7.pa_deform.conv_offset.2.bias
+ | -0.002 | -0.203 | 0.124 | 0.016 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.4.weight
+ | 0.027 | -0.097 | 0.151 | 0.048 | torch.Size([120]) || stage7.pa_deform.conv_offset.4.bias
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432, 120, 3, 3]) || stage7.pa_deform.conv_offset.6.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([432]) || stage7.pa_deform.conv_offset.6.bias
+ | -0.002 | -0.997 | 1.031 | 0.156 | torch.Size([360, 360]) || stage7.pa_fuse.fc11.weight
+ | 0.219 | -0.261 | 0.769 | 0.213 | torch.Size([360]) || stage7.pa_fuse.fc11.bias
+ | 0.001 | -1.119 | 1.206 | 0.175 | torch.Size([360, 360]) || stage7.pa_fuse.fc12.weight
+ | -0.011 | -0.547 | 0.598 | 0.195 | torch.Size([360]) || stage7.pa_fuse.fc12.bias
+ | 0.000 | -0.860 | 0.957 | 0.160 | torch.Size([120, 360]) || stage7.pa_fuse.fc2.weight
+ | 0.018 | -1.017 | 0.731 | 0.363 | torch.Size([120]) || stage7.pa_fuse.fc2.bias
+ | 1.491 | 1.080 | 1.847 | 0.135 | torch.Size([120]) || stage8.0.1.weight
+ | -0.012 | -0.370 | 0.414 | 0.140 | torch.Size([120]) || stage8.0.1.bias
+ | -0.000 | -0.882 | 1.114 | 0.177 | torch.Size([180, 120]) || stage8.0.2.weight
+ | -0.005 | -1.101 | 0.699 | 0.167 | torch.Size([180]) || stage8.0.2.bias
+ | 0.622 | 0.186 | 1.009 | 0.188 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.weight
+ | -0.006 | -0.884 | 1.056 | 0.212 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.bias
+ | -0.003 | -2.578 | 2.238 | 0.223 | torch.Size([3375, 6]) || stage8.1.residual_group.blocks.0.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.1.residual_group.blocks.0.attn.relative_position_index
+ | 0.000 | -1.042 | 1.335 | 0.152 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.0.attn.qkv_self.weight
+ | -0.007 | -0.992 | 0.938 | 0.208 | torch.Size([540]) || stage8.1.residual_group.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.692 | 0.565 | 0.129 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.0.attn.proj.weight
+ | 0.009 | -1.288 | 0.895 | 0.185 | torch.Size([180]) || stage8.1.residual_group.blocks.0.attn.proj.bias
+ | 0.415 | 0.180 | 0.539 | 0.066 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.weight
+ | -0.006 | -0.634 | 0.818 | 0.145 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.bias
+ | 0.001 | -0.969 | 0.867 | 0.145 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc11.weight
+ | -0.055 | -0.545 | 0.271 | 0.110 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.698 | 0.845 | 0.153 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc12.weight
+ | 0.007 | -0.526 | 0.444 | 0.126 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.812 | 0.874 | 0.155 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.0.mlp.fc2.weight
+ | 0.009 | -0.468 | 0.864 | 0.160 | torch.Size([180]) || stage8.1.residual_group.blocks.0.mlp.fc2.bias
+ | 0.724 | 0.198 | 0.915 | 0.128 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.weight
+ | -0.003 | -1.026 | 0.953 | 0.209 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.bias
+ | 0.030 | -3.042 | 1.112 | 0.227 | torch.Size([3375, 6]) || stage8.1.residual_group.blocks.1.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.1.residual_group.blocks.1.attn.relative_position_index
+ | 0.000 | -1.192 | 0.952 | 0.169 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.1.attn.qkv_self.weight
+ | -0.009 | -1.186 | 0.822 | 0.191 | torch.Size([540]) || stage8.1.residual_group.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.500 | 0.647 | 0.121 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.1.attn.proj.weight
+ | 0.004 | -0.892 | 1.020 | 0.208 | torch.Size([180]) || stage8.1.residual_group.blocks.1.attn.proj.bias
+ | 0.492 | 0.230 | 0.628 | 0.064 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.weight
+ | -0.006 | -0.853 | 0.872 | 0.165 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.bias
+ | 0.001 | -0.748 | 0.701 | 0.150 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc11.weight
+ | -0.055 | -0.409 | 0.305 | 0.096 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.806 | 0.662 | 0.155 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc12.weight
+ | 0.001 | -0.304 | 0.419 | 0.096 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.841 | 0.781 | 0.154 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.1.mlp.fc2.weight
+ | 0.005 | -0.280 | 0.641 | 0.119 | torch.Size([180]) || stage8.1.residual_group.blocks.1.mlp.fc2.bias
+ | 0.803 | 0.314 | 1.038 | 0.110 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.weight
+ | -0.006 | -1.202 | 1.119 | 0.207 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.bias
+ | -0.002 | -2.783 | 1.481 | 0.236 | torch.Size([3375, 6]) || stage8.1.residual_group.blocks.2.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.1.residual_group.blocks.2.attn.relative_position_index
+ | 0.000 | -0.957 | 0.943 | 0.162 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.2.attn.qkv_self.weight
+ | 0.002 | -0.519 | 0.526 | 0.136 | torch.Size([540]) || stage8.1.residual_group.blocks.2.attn.qkv_self.bias
+ | -0.000 | -0.543 | 0.516 | 0.117 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.2.attn.proj.weight
+ | 0.005 | -0.711 | 0.838 | 0.184 | torch.Size([180]) || stage8.1.residual_group.blocks.2.attn.proj.bias
+ | 0.549 | 0.206 | 0.679 | 0.078 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.weight
+ | -0.005 | -0.888 | 0.879 | 0.154 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.bias
+ | 0.000 | -0.748 | 0.896 | 0.148 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc11.weight
+ | -0.073 | -0.478 | 0.193 | 0.098 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc11.bias
+ | -0.000 | -0.628 | 0.674 | 0.157 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc12.weight
+ | -0.001 | -0.331 | 0.230 | 0.082 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc12.bias
+ | 0.001 | -0.677 | 0.673 | 0.154 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.2.mlp.fc2.weight
+ | 0.004 | -0.294 | 0.745 | 0.112 | torch.Size([180]) || stage8.1.residual_group.blocks.2.mlp.fc2.bias
+ | 0.843 | 0.308 | 0.966 | 0.094 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.weight
+ | -0.002 | -1.222 | 1.324 | 0.192 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.bias
+ | 0.001 | -2.899 | 2.240 | 0.272 | torch.Size([3375, 6]) || stage8.1.residual_group.blocks.3.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.1.residual_group.blocks.3.attn.relative_position_index
+ | -0.000 | -0.999 | 0.935 | 0.167 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.3.attn.qkv_self.weight
+ | -0.001 | -0.612 | 0.531 | 0.127 | torch.Size([540]) || stage8.1.residual_group.blocks.3.attn.qkv_self.bias
+ | 0.000 | -0.591 | 0.537 | 0.112 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.3.attn.proj.weight
+ | -0.005 | -0.476 | 1.034 | 0.188 | torch.Size([180]) || stage8.1.residual_group.blocks.3.attn.proj.bias
+ | 0.534 | 0.198 | 0.660 | 0.074 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.weight
+ | -0.006 | -0.845 | 0.869 | 0.130 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.bias
+ | 0.001 | -0.649 | 0.677 | 0.147 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc11.weight
+ | -0.080 | -0.378 | 0.228 | 0.109 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc11.bias
+ | -0.000 | -0.628 | 0.683 | 0.157 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc12.weight
+ | -0.005 | -0.300 | 0.222 | 0.083 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc12.bias
+ | 0.001 | -0.959 | 0.733 | 0.153 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.3.mlp.fc2.weight
+ | 0.003 | -0.915 | 0.961 | 0.165 | torch.Size([180]) || stage8.1.residual_group.blocks.3.mlp.fc2.bias
+ | 0.001 | -0.411 | 0.533 | 0.070 | torch.Size([180, 180]) || stage8.1.linear.weight
+ | -0.004 | -0.907 | 0.257 | 0.135 | torch.Size([180]) || stage8.1.linear.bias
+ | 0.890 | 0.143 | 1.178 | 0.177 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.weight
+ | -0.034 | -0.781 | 0.959 | 0.177 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.bias
+ | 0.001 | -2.545 | 1.182 | 0.186 | torch.Size([3375, 6]) || stage8.2.residual_group.blocks.0.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.2.residual_group.blocks.0.attn.relative_position_index
+ | 0.000 | -1.151 | 1.199 | 0.158 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.0.attn.qkv_self.weight
+ | -0.001 | -0.731 | 0.744 | 0.155 | torch.Size([540]) || stage8.2.residual_group.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.522 | 0.577 | 0.131 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.0.attn.proj.weight
+ | 0.003 | -0.537 | 0.895 | 0.164 | torch.Size([180]) || stage8.2.residual_group.blocks.0.attn.proj.bias
+ | 0.599 | 0.203 | 0.779 | 0.101 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.weight
+ | -0.021 | -0.429 | 1.016 | 0.143 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.bias
+ | -0.000 | -0.914 | 0.736 | 0.145 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc11.weight
+ | -0.054 | -0.545 | 0.183 | 0.106 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.716 | 0.750 | 0.155 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc12.weight
+ | 0.003 | -0.254 | 0.408 | 0.085 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.842 | 0.706 | 0.153 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.0.mlp.fc2.weight
+ | 0.001 | -0.277 | 0.365 | 0.093 | torch.Size([180]) || stage8.2.residual_group.blocks.0.mlp.fc2.bias
+ | 0.910 | 0.151 | 1.164 | 0.152 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.weight
+ | -0.032 | -0.801 | 1.151 | 0.191 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.bias
+ | -0.069 | -2.776 | 5.771 | 0.290 | torch.Size([3375, 6]) || stage8.2.residual_group.blocks.1.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.2.residual_group.blocks.1.attn.relative_position_index
+ | 0.000 | -1.359 | 1.101 | 0.156 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.1.attn.qkv_self.weight
+ | 0.009 | -0.624 | 0.654 | 0.155 | torch.Size([540]) || stage8.2.residual_group.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.565 | 0.575 | 0.134 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.1.attn.proj.weight
+ | -0.004 | -0.671 | 0.566 | 0.171 | torch.Size([180]) || stage8.2.residual_group.blocks.1.attn.proj.bias
+ | 0.609 | 0.206 | 0.818 | 0.109 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.weight
+ | -0.022 | -0.474 | 1.079 | 0.147 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.bias
+ | 0.000 | -0.760 | 0.819 | 0.143 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc11.weight
+ | -0.045 | -0.414 | 0.277 | 0.106 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.831 | 0.809 | 0.155 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc12.weight
+ | -0.002 | -0.544 | 0.244 | 0.082 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.749 | 0.962 | 0.151 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.1.mlp.fc2.weight
+ | 0.011 | -0.275 | 0.294 | 0.101 | torch.Size([180]) || stage8.2.residual_group.blocks.1.mlp.fc2.bias
+ | 0.990 | 0.168 | 1.270 | 0.152 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.weight
+ | -0.034 | -0.773 | 1.134 | 0.182 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.bias
+ | -0.070 | -2.190 | 5.577 | 0.255 | torch.Size([3375, 6]) || stage8.2.residual_group.blocks.2.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.2.residual_group.blocks.2.attn.relative_position_index
+ | -0.000 | -1.004 | 1.113 | 0.152 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.2.attn.qkv_self.weight
+ | 0.000 | -0.781 | 0.551 | 0.137 | torch.Size([540]) || stage8.2.residual_group.blocks.2.attn.qkv_self.bias
+ | 0.001 | -0.580 | 0.572 | 0.141 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.2.attn.proj.weight
+ | -0.001 | -0.554 | 0.820 | 0.177 | torch.Size([180]) || stage8.2.residual_group.blocks.2.attn.proj.bias
+ | 0.642 | 0.178 | 0.852 | 0.111 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.weight
+ | -0.025 | -0.413 | 0.853 | 0.124 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.bias
+ | -0.000 | -0.780 | 1.141 | 0.143 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc11.weight
+ | -0.067 | -0.860 | 0.177 | 0.114 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc11.bias
+ | -0.000 | -1.067 | 0.859 | 0.155 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc12.weight
+ | 0.002 | -0.298 | 0.225 | 0.072 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc12.bias
+ | 0.000 | -0.726 | 0.809 | 0.151 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.2.mlp.fc2.weight
+ | 0.001 | -0.394 | 0.292 | 0.112 | torch.Size([180]) || stage8.2.residual_group.blocks.2.mlp.fc2.bias
+ | 0.990 | 0.219 | 1.226 | 0.130 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.weight
+ | -0.032 | -0.837 | 1.156 | 0.168 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.bias
+ | -0.005 | -4.045 | 1.695 | 0.178 | torch.Size([3375, 6]) || stage8.2.residual_group.blocks.3.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.2.residual_group.blocks.3.attn.relative_position_index
+ | 0.000 | -0.855 | 1.101 | 0.153 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.3.attn.qkv_self.weight
+ | -0.002 | -0.706 | 0.841 | 0.123 | torch.Size([540]) || stage8.2.residual_group.blocks.3.attn.qkv_self.bias
+ | 0.000 | -0.586 | 0.699 | 0.134 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.3.attn.proj.weight
+ | 0.001 | -0.402 | 0.842 | 0.173 | torch.Size([180]) || stage8.2.residual_group.blocks.3.attn.proj.bias
+ | 0.613 | 0.196 | 0.800 | 0.102 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.weight
+ | -0.021 | -0.404 | 0.907 | 0.115 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.bias
+ | 0.000 | -0.718 | 0.654 | 0.138 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc11.weight
+ | -0.064 | -0.568 | 0.205 | 0.115 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc11.bias
+ | -0.001 | -0.674 | 0.596 | 0.155 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc12.weight
+ | -0.012 | -0.279 | 0.171 | 0.073 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc12.bias
+ | -0.000 | -0.634 | 0.692 | 0.150 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.3.mlp.fc2.weight
+ | 0.010 | -0.528 | 1.331 | 0.175 | torch.Size([180]) || stage8.2.residual_group.blocks.3.mlp.fc2.bias
+ | -0.000 | -0.361 | 0.549 | 0.078 | torch.Size([180, 180]) || stage8.2.linear.weight
+ | -0.001 | -0.682 | 0.349 | 0.142 | torch.Size([180]) || stage8.2.linear.bias
+ | 1.018 | 0.177 | 1.365 | 0.177 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.weight
+ | -0.033 | -0.673 | 0.916 | 0.166 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.bias
+ | 0.003 | -2.963 | 1.620 | 0.138 | torch.Size([3375, 6]) || stage8.3.residual_group.blocks.0.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.3.residual_group.blocks.0.attn.relative_position_index
+ | -0.000 | -1.095 | 0.939 | 0.152 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.0.attn.qkv_self.weight
+ | 0.004 | -0.725 | 0.682 | 0.135 | torch.Size([540]) || stage8.3.residual_group.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.731 | 0.755 | 0.149 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.0.attn.proj.weight
+ | 0.013 | -0.457 | 0.481 | 0.158 | torch.Size([180]) || stage8.3.residual_group.blocks.0.attn.proj.bias
+ | 0.703 | 0.276 | 0.865 | 0.096 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.weight
+ | -0.024 | -0.449 | 0.966 | 0.132 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.bias
+ | -0.001 | -0.873 | 0.665 | 0.138 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc11.weight
+ | -0.052 | -0.479 | 0.198 | 0.104 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.787 | 0.699 | 0.155 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc12.weight
+ | -0.003 | -0.436 | 0.264 | 0.081 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.675 | 0.689 | 0.153 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.0.mlp.fc2.weight
+ | 0.004 | -0.265 | 0.254 | 0.106 | torch.Size([180]) || stage8.3.residual_group.blocks.0.mlp.fc2.bias
+ | 0.956 | 0.184 | 1.255 | 0.167 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.weight
+ | -0.036 | -0.699 | 0.965 | 0.155 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.bias
+ | -0.038 | -3.913 | 4.625 | 0.210 | torch.Size([3375, 6]) || stage8.3.residual_group.blocks.1.attn.relative_position_bias_table
+ | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.3.residual_group.blocks.1.attn.relative_position_index
+ | 0.000 | -1.142 | 0.934 | 0.147 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.708 | 0.560 | 0.117 | torch.Size([540]) || stage8.3.residual_group.blocks.1.attn.qkv_self.bias
+ | -0.002 | -0.746 | 0.626 | 0.149 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.1.attn.proj.weight
+ | 0.021 | -0.378 | 0.376 | 0.127 | torch.Size([180]) || stage8.3.residual_group.blocks.1.attn.proj.bias
+ | 0.741 | 0.282 | 0.933 | 0.107 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.weight
+ | -0.028 | -0.425 | 0.898 | 0.115 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.bias
+ | -0.001
| -0.761 | 0.822 | 0.139 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc11.weight + | -0.057 | -0.502 | 0.219 | 0.100 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.829 | 0.872 | 0.156 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc12.weight + | 0.004 | -0.262 | 0.226 | 0.077 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc12.bias + | -0.001 | -0.797 | 0.765 | 0.153 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.1.mlp.fc2.weight + | -0.002 | -0.360 | 0.289 | 0.109 | torch.Size([180]) || stage8.3.residual_group.blocks.1.mlp.fc2.bias + | 1.068 | 0.207 | 1.335 | 0.160 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.weight + | -0.034 | -0.784 | 1.005 | 0.163 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.bias + | -0.004 | -2.897 | 1.185 | 0.143 | torch.Size([3375, 6]) || stage8.3.residual_group.blocks.2.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.3.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -1.055 | 0.899 | 0.151 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.2.attn.qkv_self.weight + | -0.000 | -0.572 | 0.670 | 0.120 | torch.Size([540]) || stage8.3.residual_group.blocks.2.attn.qkv_self.bias + | -0.001 | -0.729 | 0.798 | 0.156 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.2.attn.proj.weight + | 0.025 | -0.570 | 0.501 | 0.166 | torch.Size([180]) || stage8.3.residual_group.blocks.2.attn.proj.bias + | 0.759 | 0.228 | 0.969 | 0.115 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.weight + | -0.025 | -0.394 | 0.791 | 0.103 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.bias + | -0.001 | -0.962 | 0.903 | 0.137 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc11.weight + | -0.064 | -0.587 | 0.209 | 0.108 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.966 | 0.925 | 0.156 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc12.weight + | 0.004 | -0.366 | 0.239 | 0.074 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.782 | 0.817 | 0.152 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.2.mlp.fc2.weight + | 0.003 | -0.321 | 0.340 | 0.117 | torch.Size([180]) || stage8.3.residual_group.blocks.2.mlp.fc2.bias + | 1.082 | 0.237 | 1.309 | 0.144 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.weight + | -0.031 | -0.726 | 0.933 | 0.149 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.bias + | 0.005 | -3.023 | 1.093 | 0.142 | torch.Size([3375, 6]) || stage8.3.residual_group.blocks.3.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.3.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.830 | 0.867 | 0.151 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.3.attn.qkv_self.weight + | -0.001 | -0.487 | 0.710 | 0.107 | torch.Size([540]) || stage8.3.residual_group.blocks.3.attn.qkv_self.bias + | -0.001 | -0.940 | 0.725 | 0.157 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.3.attn.proj.weight + | 0.027 | -0.522 | 0.807 | 0.170 | torch.Size([180]) || stage8.3.residual_group.blocks.3.attn.proj.bias + | 0.705 | 0.249 | 0.868 | 0.095 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.weight + | -0.023 | -0.426 | 0.826 | 0.108 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.bias + | -0.000 | 
-0.814 | 0.927 | 0.131 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc11.weight + | -0.043 | -0.613 | 0.209 | 0.116 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.709 | 0.851 | 0.154 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc12.weight + | -0.004 | -0.225 | 0.241 | 0.078 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.857 | 0.845 | 0.151 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.3.mlp.fc2.weight + | 0.016 | -0.441 | 1.206 | 0.183 | torch.Size([180]) || stage8.3.residual_group.blocks.3.mlp.fc2.bias + | -0.002 | -0.437 | 0.634 | 0.077 | torch.Size([180, 180]) || stage8.3.linear.weight + | -0.003 | -0.564 | 0.338 | 0.145 | torch.Size([180]) || stage8.3.linear.bias + | 1.164 | 0.238 | 1.496 | 0.205 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.weight + | -0.033 | -0.667 | 0.780 | 0.170 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.bias + | -0.002 | -3.025 | 1.339 | 0.130 | torch.Size([3375, 6]) || stage8.4.residual_group.blocks.0.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.4.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.736 | 0.735 | 0.147 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.0.attn.qkv_self.weight + | -0.007 | -0.468 | 0.575 | 0.112 | torch.Size([540]) || stage8.4.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.725 | 0.750 | 0.162 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.0.attn.proj.weight + | -0.004 | -0.461 | 0.540 | 0.163 | torch.Size([180]) || stage8.4.residual_group.blocks.0.attn.proj.bias + | 0.804 | 0.361 | 0.962 | 0.091 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.weight + | -0.025 | -0.421 | 0.837 | 0.127 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.bias + | -0.002 | -0.664 | 0.869 | 0.129 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc11.weight + | -0.028 | -0.519 | 0.180 | 0.098 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.793 | 0.821 | 0.156 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc12.weight + | 0.001 | -0.235 | 0.329 | 0.081 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.758 | 0.730 | 0.153 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.0.mlp.fc2.weight + | 0.010 | -0.332 | 0.306 | 0.118 | torch.Size([180]) || stage8.4.residual_group.blocks.0.mlp.fc2.bias + | 1.097 | 0.202 | 1.361 | 0.200 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.weight + | -0.034 | -0.597 | 0.687 | 0.147 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.bias + | 0.007 | -4.645 | 1.140 | 0.130 | torch.Size([3375, 6]) || stage8.4.residual_group.blocks.1.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.4.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -1.002 | 0.810 | 0.144 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.1.attn.qkv_self.weight + | 0.005 | -0.407 | 0.438 | 0.108 | torch.Size([540]) || stage8.4.residual_group.blocks.1.attn.qkv_self.bias + | -0.001 | -0.646 | 0.678 | 0.154 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.1.attn.proj.weight + | 0.004 | -0.418 | 0.415 | 0.139 | torch.Size([180]) || stage8.4.residual_group.blocks.1.attn.proj.bias + | 0.836 | 0.316 | 1.026 | 0.106 | torch.Size([180]) || 
stage8.4.residual_group.blocks.1.norm2.weight + | -0.024 | -0.364 | 0.851 | 0.117 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.bias + | -0.002 | -0.690 | 0.848 | 0.128 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc11.weight + | -0.032 | -0.484 | 0.195 | 0.101 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.863 | 0.768 | 0.155 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc12.weight + | -0.001 | -0.319 | 0.409 | 0.078 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.836 | 0.822 | 0.154 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.1.mlp.fc2.weight + | 0.019 | -0.356 | 0.374 | 0.129 | torch.Size([180]) || stage8.4.residual_group.blocks.1.mlp.fc2.bias + | 1.151 | 0.229 | 1.393 | 0.176 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.weight + | -0.028 | -0.649 | 0.925 | 0.149 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.bias + | -0.005 | -3.864 | 1.138 | 0.140 | torch.Size([3375, 6]) || stage8.4.residual_group.blocks.2.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.4.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -1.813 | 0.897 | 0.146 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.2.attn.qkv_self.weight + | -0.001 | -0.449 | 0.486 | 0.103 | torch.Size([540]) || stage8.4.residual_group.blocks.2.attn.qkv_self.bias + | -0.001 | -0.739 | 0.710 | 0.175 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.2.attn.proj.weight + | -0.000 | -0.542 | 0.407 | 0.162 | torch.Size([180]) || stage8.4.residual_group.blocks.2.attn.proj.bias + | 0.820 | 0.329 | 0.989 | 0.094 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.weight + | -0.025 | -0.461 | 0.753 | 0.106 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.bias + | -0.001 | -0.648 | 0.788 | 0.125 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc11.weight + | -0.015 | -0.501 | 0.248 | 0.101 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.745 | 0.796 | 0.155 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc12.weight + | 0.007 | -0.244 | 0.231 | 0.080 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.771 | 1.049 | 0.154 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.2.mlp.fc2.weight + | 0.018 | -0.360 | 0.336 | 0.143 | torch.Size([180]) || stage8.4.residual_group.blocks.2.mlp.fc2.bias + | 1.177 | 0.269 | 1.385 | 0.163 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.weight + | -0.028 | -0.700 | 0.877 | 0.145 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.bias + | -0.005 | -2.684 | 0.830 | 0.097 | torch.Size([3375, 6]) || stage8.4.residual_group.blocks.3.attn.relative_position_bias_table + | 1687.000 | 0.000 | 3374.000 | 730.710 | torch.Size([512, 512]) || stage8.4.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.996 | 0.727 | 0.142 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.3.attn.qkv_self.weight + | 0.004 | -0.326 | 0.449 | 0.101 | torch.Size([540]) || stage8.4.residual_group.blocks.3.attn.qkv_self.bias + | -0.001 | -0.777 | 0.785 | 0.170 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.3.attn.proj.weight + | 0.004 | -0.396 | 0.449 | 0.158 | torch.Size([180]) || stage8.4.residual_group.blocks.3.attn.proj.bias + | 0.790 | 0.392 | 1.005 | 0.078 | torch.Size([180]) || 
stage8.4.residual_group.blocks.3.norm2.weight + | -0.030 | -0.481 | 0.719 | 0.110 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.bias + | -0.001 | -0.569 | 0.732 | 0.121 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc11.weight + | 0.020 | -0.670 | 0.335 | 0.125 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.822 | 0.831 | 0.155 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc12.weight + | -0.003 | -0.282 | 0.296 | 0.089 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.856 | 0.886 | 0.155 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.3.mlp.fc2.weight + | 0.029 | -0.390 | 0.437 | 0.161 | torch.Size([180]) || stage8.4.residual_group.blocks.3.mlp.fc2.bias + | -0.002 | -0.490 | 0.625 | 0.079 | torch.Size([180, 180]) || stage8.4.linear.weight + | -0.002 | -0.573 | 0.398 | 0.168 | torch.Size([180]) || stage8.4.linear.bias + | 1.337 | 0.163 | 1.694 | 0.268 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.weight + | -0.025 | -0.727 | 1.008 | 0.186 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.bias + | -0.738 | -2.885 | 5.812 | 0.748 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.852 | 0.854 | 0.135 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.0.attn.qkv_self.weight + | -0.005 | -0.546 | 0.550 | 0.112 | torch.Size([540]) || stage8.5.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.901 | 0.781 | 0.195 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.0.attn.proj.weight + | -0.020 | -0.545 | 0.469 | 0.173 | torch.Size([180]) || stage8.5.residual_group.blocks.0.attn.proj.bias + | 0.956 | 0.367 | 1.185 | 0.129 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.weight + | -0.033 | -0.519 | 0.833 | 0.147 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.bias + | -0.001 | -0.832 | 0.580 | 0.119 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc11.weight + | 0.055 | -0.256 | 0.378 | 0.097 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -1.058 | 0.859 | 0.154 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc12.weight + | 0.006 | -0.377 | 0.318 | 0.093 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc12.bias + | -0.001 | -0.751 | 0.766 | 0.156 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.0.mlp.fc2.weight + | -0.011 | -0.316 | 0.323 | 0.132 | torch.Size([180]) || stage8.5.residual_group.blocks.0.mlp.fc2.bias + | 1.346 | 0.151 | 1.746 | 0.272 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.weight + | -0.023 | -0.691 | 0.993 | 0.169 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.bias + | -0.705 | -2.997 | 4.745 | 0.748 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.911 | 0.984 | 0.141 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.1.attn.qkv_self.weight + | -0.011 | -0.405 | 0.288 | 0.095 | torch.Size([540]) || stage8.5.residual_group.blocks.1.attn.qkv_self.bias + | 0.001 | -0.853 | 0.977 | 0.210 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.1.attn.proj.weight + | -0.008 | 
-0.516 | 0.596 | 0.170 | torch.Size([180]) || stage8.5.residual_group.blocks.1.attn.proj.bias + | 1.021 | 0.333 | 1.268 | 0.154 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.weight + | -0.034 | -0.512 | 0.812 | 0.134 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.bias + | 0.000 | -0.561 | 0.546 | 0.120 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc11.weight + | 0.050 | -0.450 | 0.320 | 0.100 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc11.bias + | 0.001 | -0.907 | 0.752 | 0.157 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc12.weight + | -0.008 | -0.306 | 0.343 | 0.091 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc12.bias + | -0.001 | -0.891 | 0.741 | 0.158 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.1.mlp.fc2.weight + | -0.014 | -0.407 | 0.478 | 0.168 | torch.Size([180]) || stage8.5.residual_group.blocks.1.mlp.fc2.bias + | 1.266 | 0.195 | 1.640 | 0.251 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.weight + | -0.028 | -0.680 | 0.987 | 0.162 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.bias + | -0.515 | -2.839 | 4.668 | 0.636 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.2.attn.relative_position_index + | 0.001 | -0.968 | 0.890 | 0.144 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.2.attn.qkv_self.weight + | -0.001 | -0.372 | 0.390 | 0.095 | torch.Size([540]) || stage8.5.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -1.001 | 0.995 | 0.221 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.2.attn.proj.weight + | -0.012 | -0.576 | 0.456 | 0.172 | torch.Size([180]) || stage8.5.residual_group.blocks.2.attn.proj.bias + | 1.046 | 0.311 | 1.264 | 0.147 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.weight + | -0.033 | -0.519 | 0.785 | 0.123 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.bias + | 0.000 | -0.533 | 0.563 | 0.119 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc11.weight + | 0.053 | -0.314 | 0.364 | 0.109 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.862 | 0.822 | 0.158 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc12.weight + | -0.004 | -0.266 | 0.289 | 0.084 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc12.bias + | 0.001 | -0.787 | 0.886 | 0.161 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.2.mlp.fc2.weight + | -0.007 | -0.421 | 0.503 | 0.171 | torch.Size([180]) || stage8.5.residual_group.blocks.2.mlp.fc2.bias + | 1.226 | 0.277 | 1.561 | 0.208 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.weight + | -0.032 | -0.670 | 1.030 | 0.168 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.bias + | -0.401 | -1.953 | 3.930 | 0.598 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.857 | 0.754 | 0.139 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.3.attn.qkv_self.weight + | 0.004 | -0.317 | 0.278 | 0.081 | torch.Size([540]) || stage8.5.residual_group.blocks.3.attn.qkv_self.bias + | -0.002 | -1.022 | 0.999 | 0.200 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.3.attn.proj.weight + | -0.009 | -0.384 | 0.393 | 
0.165 | torch.Size([180]) || stage8.5.residual_group.blocks.3.attn.proj.bias + | 1.038 | 0.340 | 1.216 | 0.128 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.weight + | -0.034 | -0.574 | 0.775 | 0.124 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.bias + | 0.001 | -0.588 | 0.613 | 0.119 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc11.weight + | 0.063 | -0.447 | 0.307 | 0.111 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.873 | 0.775 | 0.159 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc12.weight + | 0.001 | -0.456 | 0.435 | 0.092 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.819 | 0.772 | 0.160 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.3.mlp.fc2.weight + | -0.018 | -0.319 | 0.340 | 0.131 | torch.Size([180]) || stage8.5.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.562 | 0.471 | 0.080 | torch.Size([180, 180]) || stage8.5.linear.weight + | 0.024 | -0.609 | 0.488 | 0.184 | torch.Size([180]) || stage8.5.linear.bias + | 1.369 | 0.171 | 1.961 | 0.355 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.weight + | -0.028 | -0.642 | 0.733 | 0.196 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.bias + | -0.029 | -1.759 | 1.624 | 0.312 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.686 | 0.691 | 0.113 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.0.attn.qkv_self.weight + | -0.003 | -0.261 | 0.301 | 0.081 | torch.Size([540]) || stage8.6.residual_group.blocks.0.attn.qkv_self.bias + | 0.001 | -0.736 | 0.637 | 0.149 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.0.attn.proj.weight + | -0.006 | -0.293 | 0.300 | 0.106 | torch.Size([180]) || stage8.6.residual_group.blocks.0.attn.proj.bias + | 1.302 | 0.401 | 1.613 | 0.192 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.weight + | -0.029 | -0.475 | 0.696 | 0.159 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.bias + | -0.001 | -0.649 | 0.564 | 0.119 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc11.weight + | 0.036 | -0.275 | 0.218 | 0.071 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.717 | 0.831 | 0.148 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc12.weight + | 0.006 | -0.231 | 0.270 | 0.074 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.833 | 0.791 | 0.150 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.0.mlp.fc2.weight + | 0.004 | -0.364 | 0.324 | 0.134 | torch.Size([180]) || stage8.6.residual_group.blocks.0.mlp.fc2.bias + | 1.450 | 0.218 | 1.962 | 0.354 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.weight + | -0.025 | -0.716 | 0.851 | 0.206 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.bias + | -0.045 | -1.549 | 2.100 | 0.321 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.759 | 0.636 | 0.110 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.1.attn.qkv_self.weight + | -0.001 | -0.235 | 0.269 | 0.070 | torch.Size([540]) || 
stage8.6.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.691 | 0.657 | 0.145 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.1.attn.proj.weight + | -0.007 | -0.375 | 0.328 | 0.116 | torch.Size([180]) || stage8.6.residual_group.blocks.1.attn.proj.bias + | 1.326 | 0.335 | 1.596 | 0.186 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.weight + | -0.029 | -0.566 | 0.748 | 0.160 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.bias + | -0.002 | -0.667 | 0.591 | 0.121 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc11.weight + | 0.042 | -0.387 | 0.373 | 0.078 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.685 | 0.894 | 0.147 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc12.weight + | 0.000 | -0.353 | 0.326 | 0.092 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.801 | 0.692 | 0.149 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.1.mlp.fc2.weight + | -0.007 | -0.331 | 0.273 | 0.127 | torch.Size([180]) || stage8.6.residual_group.blocks.1.mlp.fc2.bias + | 1.416 | 0.215 | 1.819 | 0.303 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.weight + | -0.024 | -0.596 | 0.869 | 0.211 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.bias + | -0.038 | -2.355 | 1.330 | 0.286 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.964 | 0.732 | 0.112 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.2.attn.qkv_self.weight + | 0.002 | -0.192 | 0.251 | 0.052 | torch.Size([540]) || stage8.6.residual_group.blocks.2.attn.qkv_self.bias + | 0.001 | -0.736 | 0.624 | 0.138 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.2.attn.proj.weight + | -0.008 | -0.376 | 0.254 | 0.119 | torch.Size([180]) || stage8.6.residual_group.blocks.2.attn.proj.bias + | 1.352 | 0.217 | 1.546 | 0.187 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.weight + | -0.023 | -0.627 | 0.881 | 0.164 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.bias + | -0.001 | -0.616 | 0.688 | 0.122 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc11.weight + | 0.040 | -0.332 | 0.242 | 0.083 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.970 | 0.669 | 0.148 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc12.weight + | 0.006 | -0.333 | 0.371 | 0.092 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.849 | 0.824 | 0.150 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.2.mlp.fc2.weight + | -0.007 | -0.282 | 0.333 | 0.111 | torch.Size([180]) || stage8.6.residual_group.blocks.2.mlp.fc2.bias + | 1.346 | 0.206 | 1.798 | 0.286 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.weight + | -0.022 | -0.742 | 0.797 | 0.196 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.bias + | -0.056 | -1.296 | 2.098 | 0.311 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.693 | 0.597 | 0.103 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.3.attn.qkv_self.weight + | -0.003 | -0.211 | 0.161 | 0.055 | torch.Size([540]) || 
stage8.6.residual_group.blocks.3.attn.qkv_self.bias + | -0.000 | -0.767 | 0.663 | 0.127 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.3.attn.proj.weight + | -0.011 | -0.269 | 0.169 | 0.072 | torch.Size([180]) || stage8.6.residual_group.blocks.3.attn.proj.bias + | 1.329 | 0.247 | 1.544 | 0.183 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.weight + | -0.023 | -0.619 | 0.881 | 0.171 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.bias + | -0.001 | -0.670 | 0.594 | 0.124 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc11.weight + | 0.052 | -0.262 | 0.275 | 0.073 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.899 | 0.808 | 0.149 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc12.weight + | -0.009 | -0.273 | 0.326 | 0.090 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc12.bias + | 0.001 | -0.773 | 0.930 | 0.150 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.3.mlp.fc2.weight + | -0.001 | -0.264 | 0.261 | 0.088 | torch.Size([180]) || stage8.6.residual_group.blocks.3.mlp.fc2.bias + | -0.001 | -1.128 | 1.483 | 0.100 | torch.Size([180, 180]) || stage8.6.linear.weight + | 0.014 | -0.757 | 0.769 | 0.160 | torch.Size([180]) || stage8.6.linear.bias + | 0.387 | 0.109 | 1.033 | 0.194 | torch.Size([180]) || norm.weight + | -0.006 | -0.754 | 0.773 | 0.142 | torch.Size([180]) || norm.bias + | 0.001 | -0.596 | 0.563 | 0.121 | torch.Size([120, 180]) || conv_after_body.weight + | -0.016 | -0.251 | 0.121 | 0.061 | torch.Size([120]) || conv_after_body.bias + | 0.003 | -1.347 | 1.476 | 0.161 | torch.Size([64, 120, 1, 3, 3]) || conv_before_upsample.0.weight + | -0.090 | -0.847 | 0.182 | 0.193 | torch.Size([64]) || conv_before_upsample.0.bias + | 0.002 | -1.602 | 0.994 | 0.114 | torch.Size([256, 64, 1, 3, 3]) || upsample.0.weight + | -0.059 | -0.461 | 0.137 | 0.098 | torch.Size([256]) || upsample.0.bias + | -0.005 | -4.099 | 0.822 | 0.076 | torch.Size([256, 64, 1, 3, 3]) || upsample.5.weight + | -0.137 | -0.426 | 0.152 | 0.097 | torch.Size([256]) || upsample.5.bias + | -0.000 | -0.377 | 0.324 | 0.014 | torch.Size([64, 64, 1, 3, 3]) || upsample.10.weight + | -0.000 | -0.016 | 0.014 | 0.003 | torch.Size([64]) || upsample.10.bias + | -0.000 | -0.043 | 0.040 | 0.004 | torch.Size([3, 64, 1, 3, 3]) || conv_last.weight + | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([3]) || conv_last.bias +
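Editor's note: the table above is a per-tensor statistics dump of the initialized VRT generator, one row per entry with columns mean | min | max | std | shape || name. Below is a minimal sketch of how such a dump can be reproduced for any PyTorch module; the function name describe_params is illustrative and not necessarily the exact KAIR utility, and iterating over state_dict() (rather than named_parameters()) is an assumption made so that non-trainable buffers such as attn.relative_position_index also appear, as they do in the log.

import torch
import torch.nn as nn

def describe_params(model: nn.Module) -> str:
    """Return one row of mean | min | max | std | shape || name per tensor."""
    rows = [' | {:^7s} | {:^7s} | {:^7s} | {:^7s} || {:s}'.format(
        'mean', 'min', 'max', 'std', 'param_name')]
    # state_dict() covers both trainable parameters and registered buffers,
    # e.g. the relative_position_index tensors listed in the log above.
    for name, tensor in model.state_dict().items():
        v = tensor.detach().clone().float()
        rows.append(' | {:>7.3f} | {:>7.3f} | {:>7.3f} | {:>7.3f} | {} || {:s}'.format(
            v.mean().item(), v.min().item(), v.max().item(),
            v.std().item() if v.numel() > 1 else 0.0,  # std of a scalar is undefined
            v.shape, name))
    return '\n'.join(rows)

if __name__ == '__main__':
    # Toy stand-in; a real run would pass the instantiated VRT netG.
    print(describe_params(nn.Sequential(nn.Conv2d(3, 8, 3), nn.Linear(8, 4))))

Printing v.shape directly yields the torch.Size([...]) form seen in the rows above; the dump is useful as a sanity check that pretrained weights loaded sensibly (e.g. LayerNorm weights near 1, biases near 0, and index buffers spanning 0..N-1).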