diff --git "a/KAIR/experiments/001_train_vrt_videosr_bi_reds_6frames/train.log" "b/KAIR/experiments/001_train_vrt_videosr_bi_reds_6frames/train.log"
new file mode 100644
--- /dev/null
+++ "b/KAIR/experiments/001_train_vrt_videosr_bi_reds_6frames/train.log"
@@ -0,0 +1,22331 @@
+22-03-11 09:54:38.123 : task: 001_train_vrt_videosr_bi_reds_6frames
+  model: vrt
+  gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7]
+  dist: False
+  find_unused_parameters: False
+  use_static_graph: True
+  scale: 4
+  n_channels: 3
+  path:[
+    root: experiments
+    pretrained_netG: None
+    pretrained_netE: None
+    task: experiments/001_train_vrt_videosr_bi_reds_6frames
+    log: experiments/001_train_vrt_videosr_bi_reds_6frames
+    options: experiments/001_train_vrt_videosr_bi_reds_6frames/options
+    models: experiments/001_train_vrt_videosr_bi_reds_6frames/models
+    images: experiments/001_train_vrt_videosr_bi_reds_6frames/images
+    pretrained_optimizerG: None
+  ]
+  datasets:[
+    train:[
+      name: train_dataset
+      dataset_type: VideoRecurrentTrainDataset
+      dataroot_gt: trainsets/REDS/train_sharp_with_val.lmdb
+      dataroot_lq: trainsets/REDS/train_sharp_bicubic_with_val.lmdb
+      meta_info_file: data/meta_info/meta_info_REDS_GT.txt
+      filename_tmpl: 08d
+      filename_ext: png
+      val_partition: REDS4
+      test_mode: False
+      io_backend:[
+        type: lmdb
+      ]
+      num_frame: 6
+      gt_size: 256
+      interval_list: [1]
+      random_reverse: False
+      use_hflip: True
+      use_rot: True
+      dataloader_shuffle: True
+      dataloader_num_workers: 32
+      dataloader_batch_size: 8
+      phase: train
+      scale: 4
+      n_channels: 3
+    ]
+    test:[
+      name: test_dataset
+      dataset_type: VideoRecurrentTestDataset
+      dataroot_gt: testsets/REDS4/GT
+      dataroot_lq: testsets/REDS4/sharp_bicubic
+      cache_data: True
+      io_backend:[
+        type: disk
+      ]
+      num_frame: -1
+      phase: test
+      scale: 4
+      n_channels: 3
+    ]
+  ]
+  netG:[
+    net_type: vrt
+    upscale: 4
+    img_size: [6, 64, 64]
+    window_size: [6, 8, 8]
+    depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4]
+    indep_reconsts: [11, 12]
+    embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180]
+    num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
+    spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth
+    pa_frames: 2
+    deformable_groups: 12
+    nonblind_denoising: False
+    use_checkpoint_attn: False
+    use_checkpoint_ffn: False
+    no_checkpoint_attn_blocks: []
+    no_checkpoint_ffn_blocks: []
+    init_type: default
+    scale: 4
+  ]
+  train:[
+    G_lossfn_type: charbonnier
+    G_lossfn_weight: 1.0
+    G_charbonnier_eps: 1e-09
+    E_decay: 0
+    G_optimizer_type: adam
+    G_optimizer_lr: 0.0004
+    G_optimizer_betas: [0.9, 0.99]
+    G_optimizer_wd: 0
+    G_optimizer_clipgrad: None
+    G_optimizer_reuse: True
+    fix_iter: 20000
+    fix_lr_mul: 0.125
+    fix_keys: ['spynet', 'deform']
+    total_iter: 300000
+    G_scheduler_type: CosineAnnealingWarmRestarts
+    G_scheduler_periods: 300000
+    G_scheduler_eta_min: 1e-07
+    G_regularizer_orthstep: None
+    G_regularizer_clipstep: None
+    G_param_strict: True
+    E_param_strict: True
+    checkpoint_test: 5000
+    checkpoint_save: 5000
+    checkpoint_print: 200
+    F_feature_layer: 34
+    F_weights: 1.0
+    F_lossfn_type: l1
+    F_use_input_norm: True
+    F_use_range_norm: False
+    G_scheduler_restart_weights: 1
+  ]
+  val:[
+    save_img: False
+    pad_seq: False
+    flip_seq: False
+    center_frame_only: False
+    num_frame_testing: 40
+    num_frame_overlapping: 2
+    size_patch_testing: 128
+  ]
+  opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json
+  is_train: True
+  merge_bn: False
+  merge_bn_startpoint: -1
+  num_gpu: 8
+  rank: 0
+  world_size: 1
+
+22-03-11 09:54:38.147 : Number of train images: 27,000, iters: 3,375
+22-03-11 09:54:50.175 : task: 001_train_vrt_videosr_bi_reds_6frames
+  model: vrt
+  gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7]
+  dist: False
+  find_unused_parameters: False
+  use_static_graph: True
+  scale: 4
+  n_channels: 3
+  path:[
+    root: experiments
+    pretrained_netG: None
+    pretrained_netE: None
+    task: experiments/001_train_vrt_videosr_bi_reds_6frames
+    log: experiments/001_train_vrt_videosr_bi_reds_6frames
+    options: experiments/001_train_vrt_videosr_bi_reds_6frames/options
+    models: experiments/001_train_vrt_videosr_bi_reds_6frames/models
+    images: experiments/001_train_vrt_videosr_bi_reds_6frames/images
+    pretrained_optimizerG: None
+  ]
+  datasets:[
+    train:[
+      name: train_dataset
+      dataset_type: VideoRecurrentTrainDataset
+      dataroot_gt: trainsets/REDS/train_sharp_with_val.lmdb
+      dataroot_lq: trainsets/REDS/train_sharp_bicubic_with_val.lmdb
+      meta_info_file: data/meta_info/meta_info_REDS_GT.txt
+      filename_tmpl: 08d
+      filename_ext: png
+      val_partition: REDS4
+      test_mode: False
+      io_backend:[
+        type: lmdb
+      ]
+      num_frame: 6
+      gt_size: 256
+      interval_list: [1]
+      random_reverse: False
+      use_hflip: True
+      use_rot: True
+      dataloader_shuffle: True
+      dataloader_num_workers: 32
+      dataloader_batch_size: 8
+      phase: train
+      scale: 4
+      n_channels: 3
+    ]
+    test:[
+      name: test_dataset
+      dataset_type: VideoRecurrentTestDataset
+      dataroot_gt: testsets/REDS4/GT
+      dataroot_lq: testsets/REDS4/sharp_bicubic
+      cache_data: True
+      io_backend:[
+        type: disk
+      ]
+      num_frame: -1
+      phase: test
+      scale: 4
+      n_channels: 3
+    ]
+  ]
+  netG:[
+    net_type: vrt
+    upscale: 4
+    img_size: [6, 64, 64]
+    window_size: [6, 8, 8]
+    depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4]
+    indep_reconsts: [11, 12]
+    embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180]
+    num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
+    spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth
+    pa_frames: 2
+    deformable_groups: 12
+    nonblind_denoising: False
+    use_checkpoint_attn: False
+    use_checkpoint_ffn: False
+    no_checkpoint_attn_blocks: []
+    no_checkpoint_ffn_blocks: []
+    init_type: default
+    scale: 4
+  ]
+  train:[
+    G_lossfn_type: charbonnier
+    G_lossfn_weight: 1.0
+    G_charbonnier_eps: 1e-09
+    E_decay: 0
+    G_optimizer_type: adam
+    G_optimizer_lr: 0.0004
+    G_optimizer_betas: [0.9, 0.99]
+    G_optimizer_wd: 0
+    G_optimizer_clipgrad: None
+    G_optimizer_reuse: True
+    fix_iter: 20000
+    fix_lr_mul: 0.125
+    fix_keys: ['spynet', 'deform']
+    total_iter: 300000
+    G_scheduler_type: CosineAnnealingWarmRestarts
+    G_scheduler_periods: 300000
+    G_scheduler_eta_min: 1e-07
+    G_regularizer_orthstep: None
+    G_regularizer_clipstep: None
+    G_param_strict: True
+    E_param_strict: True
+    checkpoint_test: 5000
+    checkpoint_save: 5000
+    checkpoint_print: 200
+    F_feature_layer: 34
+    F_weights: 1.0
+    F_lossfn_type: l1
+    F_use_input_norm: True
+    F_use_range_norm: False
+    G_scheduler_restart_weights: 1
+  ]
+  val:[
+    save_img: False
+    pad_seq: False
+    flip_seq: False
+    center_frame_only: False
+    num_frame_testing: 40
+    num_frame_overlapping: 2
+    size_patch_testing: 128
+  ]
+  opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json
+  is_train: True
+  merge_bn: False
+  merge_bn_startpoint: -1
+  num_gpu: 8
+  rank: 0
+  world_size: 1
+
+22-03-11 09:54:50.223 : Number of train images: 27,000, iters: 3,375
+22-03-11 09:54:57.597 : 
+Networks name: VRT
+Params number: 30676435
+Net structure:
+VRT(
+  (conv_first): Conv3d(27, 120, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1))
+  (spynet): SpyNet(
+    (basic_module): ModuleList(
+      (0): BasicModule(
+        (basic_module): Sequential(
+          (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (1): ReLU()
+          (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (3): ReLU()
+          (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (5): ReLU()
+          (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (7): ReLU()
+          (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+        )
+      )
+      (1): BasicModule(
+        (basic_module): Sequential(
+          (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (1): ReLU()
+          (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (3): ReLU()
+          (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (5): ReLU()
+          (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (7): ReLU()
+          (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+        )
+      )
+      (2): BasicModule(
+        (basic_module): Sequential(
+          (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (1): ReLU()
+          (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (3): ReLU()
+          (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (5): ReLU()
+          (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (7): ReLU()
+          (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+        )
+      )
+      (3): BasicModule(
+        (basic_module): Sequential(
+          (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (1): ReLU()
+          (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (3): ReLU()
+          (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (5): ReLU()
+          (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (7): ReLU()
+          (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+        )
+      )
+      (4): BasicModule(
+        (basic_module): Sequential(
+          (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (1): ReLU()
+          (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (3): ReLU()
+          (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (5): ReLU()
+          (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (7): ReLU()
+          (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+        )
+      )
+      (5): BasicModule(
+        (basic_module): Sequential(
+          (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (1): ReLU()
+          (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (3): ReLU()
+          (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (5): ReLU()
+          (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (7): ReLU()
+          (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+        )
+      )
+    )
+  )
+  (stage1): Stage(
+    (reshape): Sequential(
+      (0): Rearrange('n c d h w -> n d h w c')
+      (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+      (2): Rearrange('n d h w c -> n c d h w')
+    )
+    (residual_group1): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): Identity()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (2): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (3): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (4): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (5): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear1): Linear(in_features=120, out_features=120, bias=True)
+    (residual_group2): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): Identity()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear2): Linear(in_features=120, out_features=120, bias=True)
+    (pa_deform): DCNv2PackFlowGuided(
+      (conv_offset): Sequential(
+        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (1): LeakyReLU(negative_slope=0.1, inplace=True)
+        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (3): LeakyReLU(negative_slope=0.1, inplace=True)
+        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (5): LeakyReLU(negative_slope=0.1, inplace=True)
+        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+      )
+    )
+    (pa_fuse): Mlp_GEGLU(
+      (fc11): Linear(in_features=360, out_features=360, bias=True)
+      (fc12): Linear(in_features=360, out_features=360, bias=True)
+      (act): GELU()
+      (fc2): Linear(in_features=360, out_features=120, bias=True)
+      (drop): Dropout(p=0.0, inplace=False)
+    )
+  )
+  (stage2): Stage(
+    (reshape): Sequential(
+      (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2)
+      (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True)
+      (2): Linear(in_features=480, out_features=120, bias=True)
+      (3): Rearrange('n d h w c -> n c d h w')
+    )
+    (residual_group1): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (2): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (3): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (4): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (5): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear1): Linear(in_features=120, out_features=120, bias=True)
+    (residual_group2): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear2): Linear(in_features=120, out_features=120, bias=True)
+    (pa_deform): DCNv2PackFlowGuided(
+      (conv_offset): Sequential(
+        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (1): LeakyReLU(negative_slope=0.1, inplace=True)
+        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (3): LeakyReLU(negative_slope=0.1, inplace=True)
+        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (5): LeakyReLU(negative_slope=0.1, inplace=True)
+        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+      )
+    )
+    (pa_fuse): Mlp_GEGLU(
+      (fc11): Linear(in_features=360, out_features=360, bias=True)
+      (fc12): Linear(in_features=360, out_features=360, bias=True)
+      (act): GELU()
+      (fc2): Linear(in_features=360, out_features=120, bias=True)
+      (drop): Dropout(p=0.0, inplace=False)
+    )
+  )
+  (stage3): Stage(
+    (reshape): Sequential(
+      (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2)
+      (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True)
+      (2): Linear(in_features=480, out_features=120, bias=True)
+      (3): Rearrange('n d h w c -> n c d h w')
+    )
+    (residual_group1): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (2): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (3): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (4): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (5): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear1): Linear(in_features=120, out_features=120, bias=True)
+    (residual_group2): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear2): Linear(in_features=120, out_features=120, bias=True)
+    (pa_deform): DCNv2PackFlowGuided(
+      (conv_offset): Sequential(
+        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (1): LeakyReLU(negative_slope=0.1, inplace=True)
+        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (3): LeakyReLU(negative_slope=0.1, inplace=True)
+        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (5): LeakyReLU(negative_slope=0.1, inplace=True)
+        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+      )
+    )
+    (pa_fuse): Mlp_GEGLU(
+      (fc11): Linear(in_features=360, out_features=360, bias=True)
+      (fc12): Linear(in_features=360, out_features=360, bias=True)
+      (act): GELU()
+      (fc2): Linear(in_features=360, out_features=120, bias=True)
+      (drop): Dropout(p=0.0, inplace=False)
+    )
+  )
+  (stage4): Stage(
+    (reshape): Sequential(
+      (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2)
+      (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True)
+      (2): Linear(in_features=480, out_features=120, bias=True)
+      (3): Rearrange('n d h w c -> n c d h w')
+    )
+    (residual_group1): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (2): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (3): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (4): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (5): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear1): Linear(in_features=120, out_features=120, bias=True)
+    (residual_group2): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear2): Linear(in_features=120, out_features=120, bias=True)
+    (pa_deform): DCNv2PackFlowGuided(
+      (conv_offset): Sequential(
+        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (1): LeakyReLU(negative_slope=0.1, inplace=True)
+        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (3): LeakyReLU(negative_slope=0.1, inplace=True)
+        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (5): LeakyReLU(negative_slope=0.1, inplace=True)
+        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+      )
+    )
+    (pa_fuse): Mlp_GEGLU(
+      (fc11): Linear(in_features=360, out_features=360, bias=True)
+      (fc12): Linear(in_features=360, out_features=360, bias=True)
+      (act): GELU()
+      (fc2): Linear(in_features=360, out_features=120, bias=True)
+      (drop): Dropout(p=0.0, inplace=False)
+    )
+  )
+  (stage5): Stage(
+    (reshape): Sequential(
+      (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2)
+      (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True)
+      (2): Linear(in_features=30, out_features=120, bias=True)
+      (3): Rearrange('n d h w c -> n c d h w')
+    )
+    (residual_group1): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (2): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (3): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (4): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (5): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear1): Linear(in_features=120, out_features=120, bias=True)
+    (residual_group2): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear2): Linear(in_features=120, out_features=120, bias=True)
+    (pa_deform): DCNv2PackFlowGuided(
+      (conv_offset): Sequential(
+        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (1): LeakyReLU(negative_slope=0.1, inplace=True)
+        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (3): LeakyReLU(negative_slope=0.1, inplace=True)
+        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (5): LeakyReLU(negative_slope=0.1, inplace=True)
+        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+      )
+    )
+    (pa_fuse): Mlp_GEGLU(
+      (fc11): Linear(in_features=360, out_features=360, bias=True)
+      (fc12): Linear(in_features=360, out_features=360, bias=True)
+      (act): GELU()
+      (fc2): Linear(in_features=360, out_features=120, bias=True)
+      (drop): Dropout(p=0.0, inplace=False)
+    )
+  )
+  (stage6): Stage(
+    (reshape): Sequential(
+      (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2)
+      (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True)
+      (2): Linear(in_features=30, out_features=120, bias=True)
+      (3): Rearrange('n d h w c -> n c d h w')
+    )
+    (residual_group1): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (2): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (3): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (4): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (5): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear1): Linear(in_features=120, out_features=120, bias=True)
+    (residual_group2): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear2): Linear(in_features=120, out_features=120, bias=True)
+    (pa_deform): DCNv2PackFlowGuided(
+      (conv_offset): Sequential(
+        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (1): LeakyReLU(negative_slope=0.1, inplace=True)
+        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (3): LeakyReLU(negative_slope=0.1, inplace=True)
+        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (5): LeakyReLU(negative_slope=0.1, inplace=True)
+        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+      )
+    )
+    (pa_fuse): Mlp_GEGLU(
+      (fc11): Linear(in_features=360, out_features=360, bias=True)
+      (fc12): Linear(in_features=360, out_features=360, bias=True)
+      (act): GELU()
+      (fc2): Linear(in_features=360, out_features=120, bias=True)
+      (drop): Dropout(p=0.0, inplace=False)
+    )
+  )
+  (stage7): Stage(
+    (reshape): Sequential(
+      (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2)
+      (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True)
+      (2): Linear(in_features=30, out_features=120, bias=True)
+      (3): Rearrange('n d h w c -> n c d h w')
+    )
+    (residual_group1): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (2): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (3): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (4): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (5): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear1): Linear(in_features=120, out_features=120, bias=True)
+    (residual_group2): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear2): Linear(in_features=120, out_features=120, bias=True)
+    (pa_deform): DCNv2PackFlowGuided(
+      (conv_offset): Sequential(
+        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (1): LeakyReLU(negative_slope=0.1, inplace=True)
+        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (3): LeakyReLU(negative_slope=0.1, inplace=True)
+        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (5): LeakyReLU(negative_slope=0.1, inplace=True)
+        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+      )
+    )
+    (pa_fuse): Mlp_GEGLU(
+      (fc11): Linear(in_features=360, out_features=360, bias=True)
+      (fc12): Linear(in_features=360, out_features=360, bias=True)
+      (act): GELU()
+      (fc2): Linear(in_features=360, out_features=120, bias=True)
+      (drop): Dropout(p=0.0, inplace=False)
+    )
+  )
+  (stage8): ModuleList(
+    (0): Sequential(
+      (0): Rearrange('n c d h w -> n d h w c')
+      (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+      (2): Linear(in_features=120, out_features=180, bias=True)
+      (3): Rearrange('n d h w c -> n c d h w')
+    )
+    (1): RTMSA(
+      (residual_group): TMSAG(
+        (blocks): ModuleList(
+          (0): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (1): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (2): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (3): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+        )
+      )
+      (linear): Linear(in_features=180, out_features=180, bias=True)
+    )
+    (2): RTMSA(
+      (residual_group): TMSAG(
+        (blocks): ModuleList(
+          (0): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (1): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (2): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (3): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+        )
+      )
+      (linear): Linear(in_features=180, out_features=180, bias=True)
+    )
+    (3): RTMSA(
+      (residual_group): TMSAG(
+        (blocks): ModuleList(
+          (0): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (1): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (2): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (3): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+        )
+      )
+      (linear): Linear(in_features=180, out_features=180, bias=True)
+    )
+    (4): RTMSA(
+      (residual_group): TMSAG(
+        (blocks): ModuleList(
+          (0): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (1): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (2): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): 
Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (5): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + 
) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (6): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + ) + (norm): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (conv_after_body): Linear(in_features=180, out_features=120, bias=True) + (conv_before_upsample): Sequential( + (0): Conv3d(120, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): LeakyReLU(negative_slope=0.01, inplace=True) + ) + (upsample): Upsample( + (0): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): Transpose_Dim12() + (2): PixelShuffle(upscale_factor=2) + (3): Transpose_Dim12() + (4): LeakyReLU(negative_slope=0.1, inplace=True) + (5): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (6): Transpose_Dim12() + (7): PixelShuffle(upscale_factor=2) + (8): Transpose_Dim12() + (9): LeakyReLU(negative_slope=0.1, inplace=True) + (10): Conv3d(64, 
64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + ) + (conv_last): Conv3d(64, 3, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) +) + +22-03-11 09:54:57.779 : + | mean | min | max | std || shape + | 0.000 | -0.064 | 0.064 | 0.037 | torch.Size([120, 27, 1, 3, 3]) || conv_first.weight + | -0.005 | -0.063 | 0.062 | 0.037 | torch.Size([120]) || conv_first.bias + | 0.449 | 0.406 | 0.485 | 0.040 | torch.Size([1, 3, 1, 1]) || spynet.mean + | 0.226 | 0.224 | 0.229 | 0.003 | torch.Size([1, 3, 1, 1]) || spynet.std + | -0.000 | -0.684 | 0.720 | 0.066 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.0.basic_module.0.weight + | -0.055 | -0.917 | 0.306 | 0.335 | torch.Size([32]) || spynet.basic_module.0.basic_module.0.bias + | -0.009 | -3.201 | 0.948 | 0.096 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.0.basic_module.2.weight + | 0.039 | -1.273 | 0.675 | 0.311 | torch.Size([64]) || spynet.basic_module.0.basic_module.2.bias + | -0.010 | -4.690 | 0.568 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.0.basic_module.4.weight + | 0.162 | -0.704 | 0.905 | 0.366 | torch.Size([32]) || spynet.basic_module.0.basic_module.4.bias + | -0.023 | -1.714 | 0.414 | 0.091 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.0.basic_module.6.weight + | 0.787 | -1.061 | 1.170 | 0.522 | torch.Size([16]) || spynet.basic_module.0.basic_module.6.bias + | 0.000 | -0.145 | 0.166 | 0.018 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.0.basic_module.8.weight + | -0.000 | -0.001 | 0.000 | 0.001 | torch.Size([2]) || spynet.basic_module.0.basic_module.8.bias + | -0.000 | -0.726 | 0.782 | 0.070 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.1.basic_module.0.weight + | -0.024 | -0.810 | 0.352 | 0.313 | torch.Size([32]) || spynet.basic_module.1.basic_module.0.bias + | -0.008 | -3.370 | 0.914 | 0.098 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.1.basic_module.2.weight + | 0.042 | -1.197 | 0.699 | 0.302 | torch.Size([64]) || spynet.basic_module.1.basic_module.2.bias + | -0.008 | -4.468 | 0.566 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.1.basic_module.4.weight + | 0.160 | -0.745 | 0.996 | 0.391 | torch.Size([32]) || spynet.basic_module.1.basic_module.4.bias + | -0.017 | -1.648 | 0.317 | 0.084 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.1.basic_module.6.weight + | 0.785 | -1.176 | 1.158 | 0.543 | torch.Size([16]) || spynet.basic_module.1.basic_module.6.bias + | 0.000 | -0.145 | 0.163 | 0.014 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.1.basic_module.8.weight + | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([2]) || spynet.basic_module.1.basic_module.8.bias + | 0.000 | -1.003 | 0.875 | 0.089 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.2.basic_module.0.weight + | -0.021 | -0.979 | 0.466 | 0.373 | torch.Size([32]) || spynet.basic_module.2.basic_module.0.bias + | -0.008 | -4.622 | 1.220 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.2.basic_module.2.weight + | 0.028 | -1.276 | 0.717 | 0.308 | torch.Size([64]) || spynet.basic_module.2.basic_module.2.bias + | -0.007 | -1.827 | 0.624 | 0.092 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.2.basic_module.4.weight + | 0.123 | -0.697 | 0.745 | 0.334 | torch.Size([32]) || spynet.basic_module.2.basic_module.4.bias + | -0.010 | -1.295 | 0.330 | 0.068 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.2.basic_module.6.weight + | 0.677 | -1.696 | 0.934 | 0.637 | torch.Size([16]) || spynet.basic_module.2.basic_module.6.bias + | 0.000 | -0.114 | 0.129 | 0.008 | torch.Size([2, 16, 7, 7]) || 
spynet.basic_module.2.basic_module.8.weight + | -0.003 | -0.008 | 0.002 | 0.007 | torch.Size([2]) || spynet.basic_module.2.basic_module.8.bias + | 0.000 | -1.053 | 0.952 | 0.091 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.3.basic_module.0.weight + | -0.016 | -1.061 | 0.522 | 0.414 | torch.Size([32]) || spynet.basic_module.3.basic_module.0.bias + | -0.008 | -4.891 | 1.222 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.3.basic_module.2.weight + | 0.029 | -1.264 | 0.760 | 0.309 | torch.Size([64]) || spynet.basic_module.3.basic_module.2.bias + | -0.007 | -1.792 | 0.579 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.3.basic_module.4.weight + | 0.117 | -0.694 | 0.670 | 0.329 | torch.Size([32]) || spynet.basic_module.3.basic_module.4.bias + | -0.008 | -1.108 | 0.324 | 0.065 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.3.basic_module.6.weight + | 0.652 | -1.754 | 0.901 | 0.647 | torch.Size([16]) || spynet.basic_module.3.basic_module.6.bias + | 0.000 | -0.117 | 0.129 | 0.008 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.3.basic_module.8.weight + | 0.002 | -0.003 | 0.007 | 0.007 | torch.Size([2]) || spynet.basic_module.3.basic_module.8.bias + | -0.000 | -1.085 | 0.998 | 0.092 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.4.basic_module.0.weight + | 0.009 | -0.975 | 0.477 | 0.368 | torch.Size([32]) || spynet.basic_module.4.basic_module.0.bias + | -0.008 | -5.056 | 1.282 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.4.basic_module.2.weight + | 0.029 | -1.240 | 0.796 | 0.311 | torch.Size([64]) || spynet.basic_module.4.basic_module.2.bias + | -0.007 | -1.772 | 0.600 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.4.basic_module.4.weight + | 0.121 | -0.688 | 0.694 | 0.331 | torch.Size([32]) || spynet.basic_module.4.basic_module.4.bias + | -0.007 | -0.980 | 0.320 | 0.065 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.4.basic_module.6.weight + | 0.642 | -1.810 | 0.912 | 0.662 | torch.Size([16]) || spynet.basic_module.4.basic_module.6.bias + | 0.000 | -0.188 | 0.209 | 0.011 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.4.basic_module.8.weight + | -0.002 | -0.008 | 0.005 | 0.009 | torch.Size([2]) || spynet.basic_module.4.basic_module.8.bias + | -0.000 | -1.085 | 0.999 | 0.092 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.5.basic_module.0.weight + | 0.009 | -0.982 | 0.474 | 0.368 | torch.Size([32]) || spynet.basic_module.5.basic_module.0.bias + | -0.008 | -5.089 | 1.311 | 0.119 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.5.basic_module.2.weight + | 0.029 | -1.256 | 0.804 | 0.314 | torch.Size([64]) || spynet.basic_module.5.basic_module.2.bias + | -0.008 | -1.788 | 0.613 | 0.093 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.5.basic_module.4.weight + | 0.122 | -0.699 | 0.700 | 0.334 | torch.Size([32]) || spynet.basic_module.5.basic_module.4.bias + | -0.008 | -1.010 | 0.323 | 0.067 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.5.basic_module.6.weight + | 0.650 | -1.834 | 0.923 | 0.670 | torch.Size([16]) || spynet.basic_module.5.basic_module.6.bias + | 0.000 | -0.192 | 0.213 | 0.011 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.5.basic_module.8.weight + | -0.001 | -0.007 | 0.005 | 0.009 | torch.Size([2]) || spynet.basic_module.5.basic_module.8.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.reshape.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.reshape.1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || 
stage1.residual_group1.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.bias + | 0.000 | -0.065 | 0.069 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_self.weight + | 0.003 | -0.090 | 0.091 | 0.050 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.attn.proj.weight + | 0.005 | -0.063 | 0.064 | 0.038 | torch.Size([120]) || stage1.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.004 | -0.090 | 0.091 | 0.052 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc11.weight + | -0.002 | -0.091 | 0.091 | 0.050 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc12.weight + | -0.004 | -0.089 | 0.088 | 0.052 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.mlp.fc2.weight + | -0.003 | -0.064 | 0.064 | 0.040 | torch.Size([120]) || stage1.residual_group1.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.bias + | 0.000 | -0.070 | 0.070 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_self.weight + | 0.001 | -0.091 | 0.090 | 0.053 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.attn.proj.weight + | -0.001 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage1.residual_group1.blocks.1.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.003 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || 
stage1.residual_group1.blocks.1.mlp.fc11.weight + | -0.002 | -0.091 | 0.089 | 0.052 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc12.weight + | 0.003 | -0.091 | 0.089 | 0.051 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.mlp.fc2.weight + | -0.004 | -0.064 | 0.063 | 0.037 | torch.Size([120]) || stage1.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.bias + | -0.000 | -0.072 | 0.073 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_self.weight + | 0.002 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.038 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.attn.proj.weight + | -0.004 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage1.residual_group1.blocks.2.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.001 | -0.091 | 0.090 | 0.053 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc11.weight + | 0.002 | -0.091 | 0.090 | 0.054 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc12.weight + | -0.007 | -0.091 | 0.089 | 0.051 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.mlp.fc2.weight + | 0.000 | -0.062 | 0.064 | 0.037 | torch.Size([120]) || stage1.residual_group1.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.bias + | 0.000 | -0.067 | 0.067 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_self.weight + | 0.003 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || 
stage1.residual_group1.blocks.3.attn.proj.weight + | -0.002 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage1.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.000 | -0.090 | 0.091 | 0.051 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc11.weight + | -0.008 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc12.weight + | -0.005 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.mlp.fc2.weight + | 0.005 | -0.063 | 0.061 | 0.035 | torch.Size([120]) || stage1.residual_group1.blocks.3.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.bias + | 0.000 | -0.079 | 0.068 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_self.weight + | -0.002 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.attn.proj.weight + | 0.003 | -0.064 | 0.064 | 0.035 | torch.Size([120]) || stage1.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc11.weight + | 0.006 | -0.091 | 0.089 | 0.052 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc12.weight + | 0.006 | -0.087 | 0.091 | 0.050 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.mlp.fc2.weight + | -0.000 | -0.064 | 0.063 | 0.037 | torch.Size([120]) || stage1.residual_group1.blocks.4.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.bias + | 0.000 | -0.077 | 0.071 | 0.020 
| torch.Size([675, 6]) || stage1.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_self.weight + | 0.003 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.attn.proj.weight + | -0.004 | -0.064 | 0.064 | 0.037 | torch.Size([120]) || stage1.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.003 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc11.weight + | -0.000 | -0.089 | 0.089 | 0.050 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc12.weight + | -0.004 | -0.090 | 0.091 | 0.052 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.mlp.fc2.weight + | -0.003 | -0.064 | 0.063 | 0.034 | torch.Size([120]) || stage1.residual_group1.blocks.5.mlp.fc2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage1.linear1.weight + | -0.010 | -0.090 | 0.091 | 0.050 | torch.Size([120]) || stage1.linear1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.bias + | 0.000 | -0.079 | 0.088 | 0.020 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group2.blocks.0.attn.qkv_self.weight + | 0.005 | -0.091 | 0.091 | 0.050 | torch.Size([360]) || stage1.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage1.residual_group2.blocks.0.attn.proj.weight + | -0.002 | -0.090 | 0.090 | 0.054 | torch.Size([120]) || stage1.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc11.weight + | 0.002 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc12.weight + | 0.001 | -0.089 | 0.091 | 0.054 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc12.bias 
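The per-parameter table above (and continuing below) prints mean / min / max / std plus the shape for every entry in the model's state_dict, parameters and registered buffers alike (which is why integer buffers such as relative_position_index appear alongside trained weights). A minimal sketch of how such a table can be produced from any torch.nn.Module — an illustrative helper written for this note, not the KAIR logging utility itself:

import torch

def describe_params(model: torch.nn.Module) -> str:
    # One row per state_dict entry, matching the
    # " | mean | min | max | std || shape" layout logged above.
    rows = [" | mean | min | max | std || shape"]
    for name, t in model.state_dict().items():
        v = t.float()  # integer buffers (e.g. relative_position_index) need casting
        rows.append(
            " | {:.3f} | {:.3f} | {:.3f} | {:.3f} | {} || {}".format(
                v.mean().item(), v.min().item(), v.max().item(),
                v.std().item(), t.shape, name))
    return "\n".join(rows)

# e.g. print(describe_params(torch.nn.Conv3d(27, 120, (1, 3, 3))))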
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group2.blocks.0.mlp.fc2.weight + | 0.000 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage1.residual_group2.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.bias + | 0.000 | -0.078 | 0.083 | 0.020 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group2.blocks.1.attn.qkv_self.weight + | -0.002 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage1.residual_group2.blocks.1.attn.qkv_self.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage1.residual_group2.blocks.1.attn.proj.weight + | -0.003 | -0.088 | 0.089 | 0.052 | torch.Size([120]) || stage1.residual_group2.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc11.weight + | -0.000 | -0.090 | 0.090 | 0.053 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc12.weight + | -0.001 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group2.blocks.1.mlp.fc2.weight + | -0.000 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage1.residual_group2.blocks.1.mlp.fc2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage1.linear2.weight + | 0.000 | -0.091 | 0.091 | 0.048 | torch.Size([120]) || stage1.linear2.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.pa_deform.bias + | -0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage1.pa_deform.conv_offset.0.weight + | -0.001 | -0.021 | 0.021 | 0.012 | torch.Size([120]) || stage1.pa_deform.conv_offset.0.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.2.weight + | 0.000 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage1.pa_deform.conv_offset.2.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.4.weight + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage1.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage1.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage1.pa_deform.conv_offset.6.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage1.pa_fuse.fc11.weight + | 0.002 | -0.052 | 0.053 | 0.030 | torch.Size([360]) || stage1.pa_fuse.fc11.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage1.pa_fuse.fc12.weight + | -0.001 | -0.053 | 0.053 | 0.031 | torch.Size([360]) || stage1.pa_fuse.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage1.pa_fuse.fc2.weight + | 
0.002 | -0.052 | 0.052 | 0.030 | torch.Size([120]) || stage1.pa_fuse.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([480]) || stage2.reshape.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([480]) || stage2.reshape.1.bias + | 0.000 | -0.046 | 0.046 | 0.026 | torch.Size([120, 480]) || stage2.reshape.2.weight + | -0.001 | -0.045 | 0.045 | 0.026 | torch.Size([120]) || stage2.reshape.2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.bias + | 0.000 | -0.070 | 0.065 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_self.weight + | -0.000 | -0.090 | 0.091 | 0.053 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.attn.proj.weight + | 0.003 | -0.063 | 0.064 | 0.039 | torch.Size([120]) || stage2.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.002 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc11.weight + | -0.004 | -0.090 | 0.090 | 0.053 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc12.weight + | -0.005 | -0.090 | 0.089 | 0.055 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.mlp.fc2.weight + | 0.003 | -0.063 | 0.064 | 0.039 | torch.Size([120]) || stage2.residual_group1.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.bias + | -0.000 | -0.071 | 0.066 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.1.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.attn.proj.weight + | -0.002 | -0.064 | 0.060 | 0.037 | torch.Size([120]) || stage2.residual_group1.blocks.1.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | 
torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.003 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc11.weight + | -0.001 | -0.091 | 0.088 | 0.054 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc12.weight + | -0.004 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.mlp.fc2.weight + | -0.007 | -0.064 | 0.064 | 0.036 | torch.Size([120]) || stage2.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.bias + | 0.000 | -0.068 | 0.075 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.2.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_self.weight + | -0.002 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.attn.proj.weight + | 0.000 | -0.063 | 0.063 | 0.036 | torch.Size([120]) || stage2.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.004 | -0.091 | 0.091 | 0.050 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc11.weight + | -0.001 | -0.091 | 0.090 | 0.053 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc12.weight + | -0.008 | -0.091 | 0.091 | 0.055 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.mlp.fc2.weight + | 0.006 | -0.063 | 0.065 | 0.038 | torch.Size([120]) || stage2.residual_group1.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.bias + | -0.000 | -0.095 | 0.063 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || 
stage2.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_self.weight + | 0.001 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.attn.proj.weight + | -0.007 | -0.064 | 0.064 | 0.036 | torch.Size([120]) || stage2.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.003 | -0.090 | 0.091 | 0.054 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc11.weight + | -0.003 | -0.089 | 0.090 | 0.050 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc12.weight + | -0.003 | -0.090 | 0.091 | 0.053 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.mlp.fc2.weight + | -0.000 | -0.064 | 0.063 | 0.038 | torch.Size([120]) || stage2.residual_group1.blocks.3.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.070 | 0.081 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.attn.proj.weight + | 0.000 | -0.061 | 0.064 | 0.037 | torch.Size([120]) || stage2.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.000 | -0.090 | 0.091 | 0.054 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc11.weight + | 0.003 | -0.091 | 0.090 | 0.053 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc11.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc12.weight + | 0.003 | -0.088 | 0.091 | 0.051 | torch.Size([240]) || 
stage2.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.mlp.fc2.weight + | 0.000 | -0.064 | 0.062 | 0.037 | torch.Size([120]) || stage2.residual_group1.blocks.4.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.bias + | -0.000 | -0.072 | 0.077 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_self.weight + | -0.005 | -0.091 | 0.089 | 0.053 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.attn.proj.weight + | -0.000 | -0.063 | 0.064 | 0.039 | torch.Size([120]) || stage2.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.000 | -0.091 | 0.089 | 0.054 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc11.weight + | -0.001 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc12.weight + | -0.005 | -0.091 | 0.091 | 0.055 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.mlp.fc2.weight + | -0.000 | -0.063 | 0.065 | 0.039 | torch.Size([120]) || stage2.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage2.linear1.weight + | -0.003 | -0.090 | 0.089 | 0.054 | torch.Size([120]) || stage2.linear1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.bias + | 0.000 | -0.077 | 0.106 | 0.020 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group2.blocks.0.attn.qkv_self.weight + | 0.005 | -0.091 | 0.091 | 0.050 | torch.Size([360]) || stage2.residual_group2.blocks.0.attn.qkv_self.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage2.residual_group2.blocks.0.attn.proj.weight + | 0.005 | -0.090 | 0.090 | 0.050 | torch.Size([120]) || stage2.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 
0.000 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc11.weight + | -0.002 | -0.090 | 0.091 | 0.053 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc12.weight + | -0.002 | -0.091 | 0.090 | 0.052 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group2.blocks.0.mlp.fc2.weight + | 0.000 | -0.062 | 0.064 | 0.037 | torch.Size([120]) || stage2.residual_group2.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.bias + | 0.000 | -0.077 | 0.080 | 0.020 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group2.blocks.1.attn.qkv_self.weight + | 0.002 | -0.091 | 0.090 | 0.053 | torch.Size([360]) || stage2.residual_group2.blocks.1.attn.qkv_self.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage2.residual_group2.blocks.1.attn.proj.weight + | 0.013 | -0.088 | 0.090 | 0.051 | torch.Size([120]) || stage2.residual_group2.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc11.weight + | -0.002 | -0.090 | 0.091 | 0.051 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc12.weight + | 0.004 | -0.091 | 0.091 | 0.055 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group2.blocks.1.mlp.fc2.weight + | -0.005 | -0.063 | 0.063 | 0.038 | torch.Size([120]) || stage2.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage2.linear2.weight + | -0.000 | -0.088 | 0.090 | 0.053 | torch.Size([120]) || stage2.linear2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.pa_deform.bias + | -0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage2.pa_deform.conv_offset.0.weight + | 0.002 | -0.021 | 0.021 | 0.012 | torch.Size([120]) || stage2.pa_deform.conv_offset.0.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.2.weight + | 0.001 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage2.pa_deform.conv_offset.2.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.4.weight + | 0.002 | -0.027 | 0.030 | 0.016 | torch.Size([120]) || stage2.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage2.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 
| torch.Size([324]) || stage2.pa_deform.conv_offset.6.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage2.pa_fuse.fc11.weight + | 0.002 | -0.053 | 0.053 | 0.031 | torch.Size([360]) || stage2.pa_fuse.fc11.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage2.pa_fuse.fc12.weight + | -0.001 | -0.053 | 0.052 | 0.030 | torch.Size([360]) || stage2.pa_fuse.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.031 | torch.Size([120, 360]) || stage2.pa_fuse.fc2.weight + | -0.002 | -0.052 | 0.052 | 0.030 | torch.Size([120]) || stage2.pa_fuse.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([480]) || stage3.reshape.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([480]) || stage3.reshape.1.bias + | 0.000 | -0.046 | 0.046 | 0.026 | torch.Size([120, 480]) || stage3.reshape.2.weight + | 0.001 | -0.045 | 0.045 | 0.027 | torch.Size([120]) || stage3.reshape.2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.bias + | 0.000 | -0.072 | 0.071 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_self.weight + | 0.003 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.attn.proj.weight + | -0.001 | -0.064 | 0.064 | 0.035 | torch.Size([120]) || stage3.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc11.weight + | 0.001 | -0.090 | 0.091 | 0.052 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc12.weight + | 0.002 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.mlp.fc2.weight + | 0.001 | -0.064 | 0.064 | 0.035 | torch.Size([120]) || stage3.residual_group1.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.bias + | -0.000 | -0.071 | 0.070 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || 
stage3.residual_group1.blocks.1.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_self.weight + | 0.001 | -0.090 | 0.091 | 0.051 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.attn.proj.weight + | 0.003 | -0.060 | 0.064 | 0.035 | torch.Size([120]) || stage3.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc11.weight + | -0.004 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc12.weight + | -0.000 | -0.090 | 0.089 | 0.053 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.mlp.fc2.weight + | -0.002 | -0.064 | 0.064 | 0.037 | torch.Size([120]) || stage3.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.bias + | -0.000 | -0.076 | 0.074 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_self.weight + | 0.005 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.attn.proj.weight + | 0.001 | -0.064 | 0.064 | 0.037 | torch.Size([120]) || stage3.residual_group1.blocks.2.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.001 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc11.weight + | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc12.weight + | 0.007 | -0.090 | 0.090 | 0.053 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.mlp.fc2.weight 
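Note on reading these rows: each one lists mean | min | max | std, the tensor shape, and the state_dict key of one tensor in the VRT generator. Rows whose statistics are identically 0.000 (for example the pa_deform.conv_offset.6 weight and bias of every stage) are consistent with the common practice of zero-initializing the final offset-prediction layer of deformable alignment, so training starts from the identity warp. The fc11/fc12/fc2 triplets in the mlp and pa_fuse entries match a gated, GEGLU-style MLP, i.e. fc2(GELU(fc11(x)) * fc12(x)), as used in VRT. The sketch below shows how such a table can be produced for any PyTorch module; the function name and exact column formatting are illustrative assumptions, not necessarily KAIR's own helper. Iterating over state_dict() rather than named_parameters() is what makes non-trainable buffers such as relative_position_index appear in the listing.

import torch
from torch import nn

def describe_state_dict(net: nn.Module) -> str:
    """One ' | mean | min | max | std | shape || name' row per tensor."""
    rows = []
    for name, t in net.state_dict().items():
        v = t.float()  # buffers may be integer-typed, e.g. relative_position_index
        std = v.std().item() if v.numel() > 1 else 0.0  # std of a scalar is NaN
        rows.append(' | {:.3f} | {:.3f} | {:.3f} | {:.3f} | {} || {}'.format(
            v.mean().item(), v.min().item(), v.max().item(), std,
            tuple(t.shape), name))
    return '\n'.join(rows)

# Example: print(describe_state_dict(nn.Linear(120, 240))) yields one row for
# 'weight' with shape (240, 120) and one for 'bias' with shape (240,).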
+ | -0.002 | -0.062 | 0.064 | 0.038 | torch.Size([120]) || stage3.residual_group1.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.bias + | -0.000 | -0.073 | 0.065 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_self.weight + | 0.006 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.attn.proj.weight + | 0.002 | -0.063 | 0.063 | 0.035 | torch.Size([120]) || stage3.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.003 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc11.weight + | 0.002 | -0.091 | 0.088 | 0.051 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc12.weight + | -0.002 | -0.091 | 0.090 | 0.051 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.mlp.fc2.weight + | -0.001 | -0.065 | 0.064 | 0.040 | torch.Size([120]) || stage3.residual_group1.blocks.3.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.080 | 0.063 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.4.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.attn.proj.weight + | 0.001 | -0.064 | 0.062 | 0.040 | torch.Size([120]) || stage3.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.weight + | 0.000 | 
0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc11.weight + | -0.007 | -0.090 | 0.091 | 0.054 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc12.weight + | 0.004 | -0.091 | 0.089 | 0.052 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.mlp.fc2.weight + | -0.001 | -0.062 | 0.063 | 0.036 | torch.Size([120]) || stage3.residual_group1.blocks.4.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.bias + | -0.000 | -0.069 | 0.079 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_self.weight + | -0.004 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.attn.proj.weight + | 0.005 | -0.064 | 0.064 | 0.036 | torch.Size([120]) || stage3.residual_group1.blocks.5.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.002 | -0.090 | 0.091 | 0.053 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc11.weight + | -0.005 | -0.090 | 0.090 | 0.055 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc12.weight + | -0.000 | -0.091 | 0.089 | 0.053 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.mlp.fc2.weight + | 0.004 | -0.064 | 0.064 | 0.040 | torch.Size([120]) || stage3.residual_group1.blocks.5.mlp.fc2.bias + | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([120, 120]) || stage3.linear1.weight + | 0.003 | -0.091 | 0.091 | 0.054 | torch.Size([120]) || stage3.linear1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.bias + | -0.000 | -0.077 | 0.075 | 0.020 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.0.attn.relative_position_index + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || 
stage3.residual_group2.blocks.0.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage3.residual_group2.blocks.0.attn.qkv_self.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage3.residual_group2.blocks.0.attn.proj.weight + | -0.011 | -0.091 | 0.091 | 0.053 | torch.Size([120]) || stage3.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc11.weight + | -0.008 | -0.091 | 0.089 | 0.052 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc12.weight + | -0.004 | -0.090 | 0.090 | 0.053 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group2.blocks.0.mlp.fc2.weight + | -0.002 | -0.063 | 0.064 | 0.039 | torch.Size([120]) || stage3.residual_group2.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.088 | 0.080 | 0.020 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group2.blocks.1.attn.qkv_self.weight + | -0.002 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage3.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage3.residual_group2.blocks.1.attn.proj.weight + | -0.003 | -0.091 | 0.089 | 0.054 | torch.Size([120]) || stage3.residual_group2.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc11.weight + | -0.000 | -0.090 | 0.090 | 0.054 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc12.weight + | 0.002 | -0.089 | 0.091 | 0.051 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group2.blocks.1.mlp.fc2.weight + | 0.002 | -0.061 | 0.062 | 0.034 | torch.Size([120]) || stage3.residual_group2.blocks.1.mlp.fc2.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage3.linear2.weight + | 0.002 | -0.089 | 0.091 | 0.048 | torch.Size([120]) || stage3.linear2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.pa_deform.bias + | 0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage3.pa_deform.conv_offset.0.weight + | 0.000 | -0.021 | 0.021 | 0.011 | torch.Size([120]) || stage3.pa_deform.conv_offset.0.bias + | 0.000 | -0.030 | 0.030 | 
0.018 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.2.weight + | -0.002 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage3.pa_deform.conv_offset.2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.4.weight + | -0.001 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage3.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage3.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage3.pa_deform.conv_offset.6.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage3.pa_fuse.fc11.weight + | -0.002 | -0.053 | 0.053 | 0.029 | torch.Size([360]) || stage3.pa_fuse.fc11.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage3.pa_fuse.fc12.weight + | 0.005 | -0.053 | 0.052 | 0.030 | torch.Size([360]) || stage3.pa_fuse.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage3.pa_fuse.fc2.weight + | 0.007 | -0.052 | 0.053 | 0.029 | torch.Size([120]) || stage3.pa_fuse.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([480]) || stage4.reshape.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([480]) || stage4.reshape.1.bias + | -0.000 | -0.046 | 0.046 | 0.026 | torch.Size([120, 480]) || stage4.reshape.2.weight + | -0.002 | -0.046 | 0.045 | 0.027 | torch.Size([120]) || stage4.reshape.2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.bias + | 0.000 | -0.065 | 0.070 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_self.weight + | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.attn.proj.weight + | -0.002 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage4.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.004 | -0.091 | 0.090 | 0.055 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc11.weight + | 0.004 | -0.091 | 0.090 | 0.053 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc12.weight + | -0.000 | -0.091 | 0.090 | 0.053 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.mlp.fc2.weight + | 0.001 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage4.residual_group1.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | 
torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.bias + | 0.000 | -0.073 | 0.086 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.1.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.attn.proj.weight + | 0.003 | -0.065 | 0.063 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.004 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc11.weight + | -0.003 | -0.091 | 0.089 | 0.051 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc12.weight + | -0.001 | -0.091 | 0.089 | 0.053 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.mlp.fc2.weight + | -0.004 | -0.064 | 0.063 | 0.037 | torch.Size([120]) || stage4.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.bias + | 0.000 | -0.064 | 0.069 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_self.weight + | 0.002 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.attn.proj.weight + | -0.004 | -0.063 | 0.064 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.2.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.002 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 
120]) || stage4.residual_group1.blocks.2.mlp.fc11.weight + | -0.006 | -0.090 | 0.091 | 0.054 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc12.weight + | 0.004 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.mlp.fc2.weight + | 0.003 | -0.065 | 0.064 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.bias + | -0.000 | -0.067 | 0.074 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.attn.proj.weight + | 0.002 | -0.064 | 0.064 | 0.042 | torch.Size([120]) || stage4.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.001 | -0.090 | 0.091 | 0.051 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc11.weight + | 0.001 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc12.weight + | -0.001 | -0.089 | 0.091 | 0.052 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.mlp.fc2.weight + | 0.006 | -0.064 | 0.064 | 0.036 | torch.Size([120]) || stage4.residual_group1.blocks.3.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.bias + | 0.000 | -0.074 | 0.077 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_self.weight + | -0.004 | -0.090 | 0.091 | 0.053 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || 
stage4.residual_group1.blocks.4.attn.proj.weight + | -0.003 | -0.061 | 0.064 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.003 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc11.weight + | 0.000 | -0.090 | 0.089 | 0.050 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc12.weight + | -0.001 | -0.091 | 0.090 | 0.052 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.mlp.fc2.weight + | -0.002 | -0.065 | 0.063 | 0.035 | torch.Size([120]) || stage4.residual_group1.blocks.4.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.bias + | 0.000 | -0.076 | 0.074 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_self.weight + | -0.000 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.attn.proj.weight + | -0.001 | -0.063 | 0.064 | 0.036 | torch.Size([120]) || stage4.residual_group1.blocks.5.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.001 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc11.weight + | 0.001 | -0.091 | 0.089 | 0.052 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc12.weight + | 0.004 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.mlp.fc2.weight + | 0.001 | -0.064 | 0.064 | 0.035 | torch.Size([120]) || stage4.residual_group1.blocks.5.mlp.fc2.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage4.linear1.weight + | 0.005 | -0.091 | 0.091 | 0.053 | torch.Size([120]) || stage4.linear1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || 
stage4.residual_group2.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.bias + | -0.000 | -0.066 | 0.086 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group2.blocks.0.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage4.residual_group2.blocks.0.attn.qkv_self.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage4.residual_group2.blocks.0.attn.proj.weight + | -0.005 | -0.089 | 0.084 | 0.053 | torch.Size([120]) || stage4.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.bias + | -0.001 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc11.weight + | -0.003 | -0.090 | 0.090 | 0.051 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc12.weight + | -0.006 | -0.090 | 0.089 | 0.054 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group2.blocks.0.mlp.fc2.weight + | -0.003 | -0.064 | 0.062 | 0.037 | torch.Size([120]) || stage4.residual_group2.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.074 | 0.082 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group2.blocks.1.attn.qkv_self.weight + | 0.004 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage4.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage4.residual_group2.blocks.1.attn.proj.weight + | 0.000 | -0.091 | 0.091 | 0.055 | torch.Size([120]) || stage4.residual_group2.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc11.weight + | 0.001 | -0.091 | 0.090 | 0.056 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc12.weight + | -0.002 | -0.090 | 0.091 | 0.052 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group2.blocks.1.mlp.fc2.weight + | -0.004 | -0.064 | 0.062 | 0.036 | torch.Size([120]) || stage4.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage4.linear2.weight + | 0.006 | 
-0.091 | 0.090 | 0.057 | torch.Size([120]) || stage4.linear2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.pa_deform.bias + | -0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage4.pa_deform.conv_offset.0.weight + | -0.000 | -0.020 | 0.021 | 0.011 | torch.Size([120]) || stage4.pa_deform.conv_offset.0.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.2.weight + | -0.003 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage4.pa_deform.conv_offset.2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.4.weight + | -0.001 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage4.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage4.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage4.pa_deform.conv_offset.6.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage4.pa_fuse.fc11.weight + | 0.000 | -0.052 | 0.053 | 0.029 | torch.Size([360]) || stage4.pa_fuse.fc11.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage4.pa_fuse.fc12.weight + | -0.001 | -0.052 | 0.053 | 0.029 | torch.Size([360]) || stage4.pa_fuse.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage4.pa_fuse.fc2.weight + | -0.002 | -0.053 | 0.051 | 0.029 | torch.Size([120]) || stage4.pa_fuse.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([30]) || stage5.reshape.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([30]) || stage5.reshape.1.bias + | -0.002 | -0.183 | 0.182 | 0.105 | torch.Size([120, 30]) || stage5.reshape.2.weight + | 0.014 | -0.182 | 0.181 | 0.113 | torch.Size([120]) || stage5.reshape.2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.073 | 0.066 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_self.weight + | -0.001 | -0.090 | 0.090 | 0.050 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.attn.proj.weight + | 0.006 | -0.062 | 0.064 | 0.039 | torch.Size([120]) || stage5.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc11.weight + | -0.001 | -0.091 | 0.090 | 0.052 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc11.bias + 
| -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc12.weight + | 0.004 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.mlp.fc2.weight + | 0.002 | -0.064 | 0.063 | 0.039 | torch.Size([120]) || stage5.residual_group1.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.bias + | 0.000 | -0.073 | 0.082 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.attn.proj.weight + | 0.002 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage5.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.001 | -0.090 | 0.091 | 0.053 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc11.weight + | -0.003 | -0.090 | 0.090 | 0.053 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc12.weight + | -0.001 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.mlp.fc2.weight + | -0.000 | -0.063 | 0.062 | 0.036 | torch.Size([120]) || stage5.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.bias + | -0.000 | -0.086 | 0.069 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.2.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_self.weight + | -0.004 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.attn.proj.weight + | 0.004 | -0.063 | 0.064 | 0.040 | torch.Size([120]) || stage5.residual_group1.blocks.2.attn.proj.bias + | 0.000 | 
-0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.004 | -0.091 | 0.090 | 0.053 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc11.weight + | 0.005 | -0.091 | 0.090 | 0.054 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc11.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc12.weight + | 0.001 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.mlp.fc2.weight + | 0.000 | -0.064 | 0.063 | 0.039 | torch.Size([120]) || stage5.residual_group1.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.bias + | 0.000 | -0.070 | 0.068 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_self.weight + | -0.003 | -0.090 | 0.091 | 0.052 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.attn.proj.weight + | 0.003 | -0.063 | 0.064 | 0.038 | torch.Size([120]) || stage5.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.001 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc11.weight + | 0.002 | -0.091 | 0.091 | 0.049 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc12.weight + | 0.001 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.mlp.fc2.weight + | 0.001 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage5.residual_group1.blocks.3.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.068 | 0.077 | 0.019 | torch.Size([675, 6]) || stage5.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || 
stage5.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.attn.proj.weight + | -0.003 | -0.063 | 0.064 | 0.039 | torch.Size([120]) || stage5.residual_group1.blocks.4.attn.proj.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.003 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc11.weight + | -0.002 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc12.weight + | 0.002 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.mlp.fc2.weight + | 0.001 | -0.063 | 0.063 | 0.040 | torch.Size([120]) || stage5.residual_group1.blocks.4.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.bias + | 0.000 | -0.068 | 0.075 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_self.weight + | -0.003 | -0.090 | 0.091 | 0.053 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.attn.proj.weight + | 0.001 | -0.063 | 0.063 | 0.034 | torch.Size([120]) || stage5.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.002 | -0.090 | 0.091 | 0.053 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc11.weight + | -0.002 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc12.weight + | -0.001 | -0.091 | 0.091 | 0.057 | torch.Size([240]) || 
stage5.residual_group1.blocks.5.mlp.fc12.bias + | -0.001 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.mlp.fc2.weight + | -0.003 | -0.064 | 0.061 | 0.038 | torch.Size([120]) || stage5.residual_group1.blocks.5.mlp.fc2.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage5.linear1.weight + | 0.002 | -0.089 | 0.091 | 0.052 | torch.Size([120]) || stage5.linear1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.bias + | -0.000 | -0.079 | 0.089 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group2.blocks.0.attn.qkv_self.weight + | 0.002 | -0.090 | 0.090 | 0.049 | torch.Size([360]) || stage5.residual_group2.blocks.0.attn.qkv_self.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage5.residual_group2.blocks.0.attn.proj.weight + | 0.000 | -0.091 | 0.090 | 0.049 | torch.Size([120]) || stage5.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc11.weight + | 0.000 | -0.091 | 0.089 | 0.056 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc12.weight + | 0.003 | -0.091 | 0.091 | 0.055 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group2.blocks.0.mlp.fc2.weight + | -0.006 | -0.062 | 0.062 | 0.036 | torch.Size([120]) || stage5.residual_group2.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.bias + | 0.000 | -0.077 | 0.082 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group2.blocks.1.attn.qkv_self.weight + | -0.001 | -0.090 | 0.091 | 0.053 | torch.Size([360]) || stage5.residual_group2.blocks.1.attn.qkv_self.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage5.residual_group2.blocks.1.attn.proj.weight + | -0.007 | -0.090 | 0.091 | 0.054 | torch.Size([120]) || stage5.residual_group2.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc11.weight + | 0.005 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 
120]) || stage5.residual_group2.blocks.1.mlp.fc12.weight + | -0.007 | -0.091 | 0.090 | 0.051 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group2.blocks.1.mlp.fc2.weight + | -0.001 | -0.064 | 0.062 | 0.037 | torch.Size([120]) || stage5.residual_group2.blocks.1.mlp.fc2.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage5.linear2.weight + | 0.006 | -0.089 | 0.091 | 0.053 | torch.Size([120]) || stage5.linear2.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.pa_deform.bias + | 0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage5.pa_deform.conv_offset.0.weight + | -0.002 | -0.021 | 0.021 | 0.013 | torch.Size([120]) || stage5.pa_deform.conv_offset.0.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.2.weight + | -0.002 | -0.030 | 0.029 | 0.017 | torch.Size([120]) || stage5.pa_deform.conv_offset.2.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.4.weight + | -0.003 | -0.029 | 0.030 | 0.017 | torch.Size([120]) || stage5.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage5.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage5.pa_deform.conv_offset.6.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage5.pa_fuse.fc11.weight + | 0.002 | -0.052 | 0.052 | 0.030 | torch.Size([360]) || stage5.pa_fuse.fc11.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage5.pa_fuse.fc12.weight + | 0.003 | -0.053 | 0.052 | 0.032 | torch.Size([360]) || stage5.pa_fuse.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage5.pa_fuse.fc2.weight + | -0.001 | -0.050 | 0.051 | 0.030 | torch.Size([120]) || stage5.pa_fuse.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([30]) || stage6.reshape.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([30]) || stage6.reshape.1.bias + | -0.002 | -0.183 | 0.183 | 0.107 | torch.Size([120, 30]) || stage6.reshape.2.weight + | -0.007 | -0.178 | 0.182 | 0.107 | torch.Size([120]) || stage6.reshape.2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.073 | 0.070 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_self.weight + | 0.003 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.attn.proj.weight + | 0.000 | -0.064 | 0.063 | 0.038 | torch.Size([120]) || stage6.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.002 | -0.089 | 0.091 | 0.052 | 
torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc11.weight + | 0.001 | -0.091 | 0.090 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc12.weight + | -0.005 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.mlp.fc2.weight + | -0.001 | -0.065 | 0.064 | 0.038 | torch.Size([120]) || stage6.residual_group1.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.bias + | 0.000 | -0.068 | 0.071 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_self.weight + | -0.004 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.attn.proj.weight + | -0.005 | -0.064 | 0.061 | 0.037 | torch.Size([120]) || stage6.residual_group1.blocks.1.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc11.weight + | 0.004 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc11.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc12.weight + | 0.004 | -0.091 | 0.090 | 0.048 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.mlp.fc2.weight + | 0.002 | -0.063 | 0.064 | 0.035 | torch.Size([120]) || stage6.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.bias + | -0.000 | -0.065 | 0.067 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || 
stage6.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_self.weight + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.attn.proj.weight + | -0.002 | -0.064 | 0.064 | 0.036 | torch.Size([120]) || stage6.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.004 | -0.090 | 0.091 | 0.052 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc11.weight + | -0.005 | -0.091 | 0.090 | 0.052 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc12.weight + | 0.005 | -0.091 | 0.090 | 0.051 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.mlp.fc2.weight + | 0.002 | -0.062 | 0.064 | 0.035 | torch.Size([120]) || stage6.residual_group1.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.bias + | -0.000 | -0.068 | 0.077 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_self.weight + | 0.004 | -0.090 | 0.091 | 0.050 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.attn.proj.weight + | 0.000 | -0.063 | 0.063 | 0.038 | torch.Size([120]) || stage6.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.002 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc11.weight + | -0.008 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc12.weight + | 0.002 | -0.089 | 0.089 | 0.052 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.mlp.fc2.weight 
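The attention buffers above repeat two shape patterns across all stages: a relative_position_bias_table of shape [675, 6] paired with a [128, 128] relative_position_index (values 0..674, mean 337), and a [2475, 6] table paired with a [384, 384] index (values 0..2474, mean 1237). These sizes are consistent with Swin-style relative position bias over 3D (frame, height, width) attention windows with 6 heads: a (2, 8, 8) window gives (2*2-1)*(2*8-1)*(2*8-1) = 675 table rows and 2*8*8 = 128 tokens, while the full (6, 8, 8) window of each second residual group gives (2*6-1)*15*15 = 2475 rows and 6*8*8 = 384 tokens. The fixed position_bias buffers of shape [1, 64, 120] with values in [-1, 1] look like a sine/cosine positional encoding for the mutual-attention branch. A sketch of the standard index construction follows; it assumes VRT follows the usual Swin Transformer recipe (function and variable names are illustrative).

import torch

def relative_position_index_3d(d: int, h: int, w: int) -> torch.Tensor:
    """Pairwise relative-position ids for a d x h x w attention window."""
    coords = torch.stack(torch.meshgrid(
        torch.arange(d), torch.arange(h), torch.arange(w), indexing='ij'))
    coords = torch.flatten(coords, 1)                 # (3, N) with N = d*h*w
    rel = coords[:, :, None] - coords[:, None, :]     # (3, N, N) pairwise offsets
    rel = rel.permute(1, 2, 0).contiguous()           # (N, N, 3)
    rel[:, :, 0] += d - 1                             # shift each axis to start at 0
    rel[:, :, 1] += h - 1
    rel[:, :, 2] += w - 1
    rel[:, :, 0] *= (2 * h - 1) * (2 * w - 1)         # flatten the 3D offset
    rel[:, :, 1] *= 2 * w - 1                         # into a single table id
    return rel.sum(-1)                                # (N, N), ids in [0, (2d-1)(2h-1)(2w-1)-1]

# relative_position_index_3d(2, 8, 8) has shape (128, 128) and max 674;
# relative_position_index_3d(6, 8, 8) has shape (384, 384) and max 2474,
# matching the buffer statistics logged above.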
+ | 0.005 | -0.063 | 0.064 | 0.037 | torch.Size([120]) || stage6.residual_group1.blocks.3.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.086 | 0.071 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.4.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.attn.proj.weight + | 0.004 | -0.063 | 0.064 | 0.038 | torch.Size([120]) || stage6.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc11.weight + | 0.001 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc12.weight + | 0.008 | -0.088 | 0.091 | 0.055 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.mlp.fc2.weight + | 0.001 | -0.063 | 0.064 | 0.037 | torch.Size([120]) || stage6.residual_group1.blocks.4.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.bias + | 0.000 | -0.074 | 0.065 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_self.weight + | 0.001 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.attn.proj.weight + | 0.001 | -0.065 | 0.063 | 0.039 | torch.Size([120]) || stage6.residual_group1.blocks.5.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.005 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.weight + | 0.000 | 0.000 | 
0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc11.weight + | -0.002 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc11.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc12.weight + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.mlp.fc2.weight + | 0.000 | -0.064 | 0.064 | 0.037 | torch.Size([120]) || stage6.residual_group1.blocks.5.mlp.fc2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage6.linear1.weight + | 0.001 | -0.091 | 0.090 | 0.051 | torch.Size([120]) || stage6.linear1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.bias + | 0.000 | -0.075 | 0.086 | 0.020 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group2.blocks.0.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage6.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage6.residual_group2.blocks.0.attn.proj.weight + | -0.001 | -0.090 | 0.090 | 0.053 | torch.Size([120]) || stage6.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc11.weight + | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc12.weight + | 0.001 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group2.blocks.0.mlp.fc2.weight + | -0.001 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage6.residual_group2.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.079 | 0.081 | 0.020 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group2.blocks.1.attn.qkv_self.weight + | -0.003 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage6.residual_group2.blocks.1.attn.qkv_self.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage6.residual_group2.blocks.1.attn.proj.weight + | 0.005 | -0.089 | 0.090 | 0.054 | torch.Size([120]) || stage6.residual_group2.blocks.1.attn.proj.bias + | 
1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc11.weight + | 0.000 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc12.weight + | 0.000 | -0.090 | 0.090 | 0.054 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group2.blocks.1.mlp.fc2.weight + | -0.004 | -0.063 | 0.064 | 0.038 | torch.Size([120]) || stage6.residual_group2.blocks.1.mlp.fc2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage6.linear2.weight + | -0.004 | -0.091 | 0.091 | 0.051 | torch.Size([120]) || stage6.linear2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.pa_deform.bias + | 0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage6.pa_deform.conv_offset.0.weight + | 0.001 | -0.021 | 0.021 | 0.012 | torch.Size([120]) || stage6.pa_deform.conv_offset.0.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.2.weight + | -0.004 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage6.pa_deform.conv_offset.2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.4.weight + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage6.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage6.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage6.pa_deform.conv_offset.6.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage6.pa_fuse.fc11.weight + | -0.000 | -0.053 | 0.052 | 0.032 | torch.Size([360]) || stage6.pa_fuse.fc11.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage6.pa_fuse.fc12.weight + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360]) || stage6.pa_fuse.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage6.pa_fuse.fc2.weight + | 0.005 | -0.051 | 0.052 | 0.030 | torch.Size([120]) || stage6.pa_fuse.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([30]) || stage7.reshape.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([30]) || stage7.reshape.1.bias + | -0.001 | -0.182 | 0.182 | 0.106 | torch.Size([120, 30]) || stage7.reshape.2.weight + | 0.005 | -0.178 | 0.181 | 0.109 | torch.Size([120]) || stage7.reshape.2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.064 | 0.075 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_self.weight + | -0.004 | -0.091 | 0.090 | 
0.051 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.attn.proj.weight + | 0.002 | -0.063 | 0.064 | 0.040 | torch.Size([120]) || stage7.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.002 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc11.weight + | 0.002 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc12.weight + | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.mlp.fc2.weight + | -0.004 | -0.064 | 0.062 | 0.038 | torch.Size([120]) || stage7.residual_group1.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.bias + | -0.000 | -0.075 | 0.075 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.1.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_self.weight + | 0.002 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.attn.proj.weight + | 0.001 | -0.063 | 0.064 | 0.036 | torch.Size([120]) || stage7.residual_group1.blocks.1.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.005 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc11.weight + | 0.000 | -0.090 | 0.091 | 0.052 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc11.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc12.weight + | -0.003 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.mlp.fc2.weight + | -0.004 | -0.064 | 0.062 | 0.037 | torch.Size([120]) || stage7.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || 
stage7.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.bias + | 0.000 | -0.063 | 0.092 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_self.weight + | -0.004 | -0.090 | 0.091 | 0.053 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.attn.proj.weight + | -0.000 | -0.064 | 0.062 | 0.036 | torch.Size([120]) || stage7.residual_group1.blocks.2.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.000 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc11.weight + | -0.000 | -0.091 | 0.089 | 0.055 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc12.weight + | -0.002 | -0.090 | 0.091 | 0.053 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.mlp.fc2.weight + | 0.000 | -0.064 | 0.064 | 0.036 | torch.Size([120]) || stage7.residual_group1.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.bias + | -0.000 | -0.083 | 0.079 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_self.weight + | 0.001 | -0.091 | 0.090 | 0.051 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.attn.proj.weight + | -0.001 | -0.062 | 0.064 | 0.036 | torch.Size([120]) || stage7.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || 
stage7.residual_group1.blocks.3.mlp.fc11.weight + | -0.002 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc12.weight + | 0.001 | -0.090 | 0.091 | 0.053 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.mlp.fc2.weight + | -0.003 | -0.061 | 0.064 | 0.035 | torch.Size([120]) || stage7.residual_group1.blocks.3.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.bias + | 0.000 | -0.077 | 0.084 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.4.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_self.weight + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.attn.proj.weight + | -0.005 | -0.064 | 0.063 | 0.037 | torch.Size([120]) || stage7.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.000 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc11.weight + | 0.001 | -0.089 | 0.090 | 0.053 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc12.weight + | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.mlp.fc2.weight + | -0.001 | -0.063 | 0.062 | 0.034 | torch.Size([120]) || stage7.residual_group1.blocks.4.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.bias + | 0.000 | -0.071 | 0.078 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_self.weight + | 0.001 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || 
stage7.residual_group1.blocks.5.attn.proj.weight + | 0.004 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage7.residual_group1.blocks.5.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.011 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc11.weight + | -0.003 | -0.091 | 0.090 | 0.050 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc12.weight + | 0.004 | -0.090 | 0.090 | 0.051 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.mlp.fc2.weight + | -0.002 | -0.064 | 0.062 | 0.036 | torch.Size([120]) || stage7.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage7.linear1.weight + | -0.005 | -0.089 | 0.090 | 0.055 | torch.Size([120]) || stage7.linear1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.bias + | 0.000 | -0.077 | 0.074 | 0.020 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage7.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group2.blocks.0.attn.qkv_self.weight + | -0.003 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage7.residual_group2.blocks.0.attn.qkv_self.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage7.residual_group2.blocks.0.attn.proj.weight + | 0.002 | -0.090 | 0.091 | 0.053 | torch.Size([120]) || stage7.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc11.weight + | 0.002 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc12.weight + | 0.002 | -0.091 | 0.090 | 0.051 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group2.blocks.0.mlp.fc2.weight + | 0.002 | -0.060 | 0.062 | 0.036 | torch.Size([120]) || stage7.residual_group2.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.086 | 0.077 | 0.020 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || 
stage7.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group2.blocks.1.attn.qkv_self.weight + | -0.004 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage7.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage7.residual_group2.blocks.1.attn.proj.weight + | 0.000 | -0.089 | 0.089 | 0.053 | torch.Size([120]) || stage7.residual_group2.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc11.weight + | 0.005 | -0.090 | 0.091 | 0.053 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc12.weight + | -0.002 | -0.090 | 0.091 | 0.054 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group2.blocks.1.mlp.fc2.weight + | -0.004 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage7.residual_group2.blocks.1.mlp.fc2.bias + | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([120, 120]) || stage7.linear2.weight + | -0.007 | -0.090 | 0.090 | 0.051 | torch.Size([120]) || stage7.linear2.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.pa_deform.bias + | -0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage7.pa_deform.conv_offset.0.weight + | 0.001 | -0.021 | 0.021 | 0.012 | torch.Size([120]) || stage7.pa_deform.conv_offset.0.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.2.weight + | -0.001 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage7.pa_deform.conv_offset.2.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.4.weight + | 0.001 | -0.030 | 0.028 | 0.017 | torch.Size([120]) || stage7.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage7.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage7.pa_deform.conv_offset.6.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage7.pa_fuse.fc11.weight + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360]) || stage7.pa_fuse.fc11.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage7.pa_fuse.fc12.weight + | 0.000 | -0.053 | 0.052 | 0.031 | torch.Size([360]) || stage7.pa_fuse.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage7.pa_fuse.fc2.weight + | 0.002 | -0.052 | 0.053 | 0.029 | torch.Size([120]) || stage7.pa_fuse.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage8.0.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage8.0.1.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([180, 120]) || stage8.0.2.weight + | 0.005 | -0.090 | 0.090 | 0.050 | torch.Size([180]) || stage8.0.2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.bias + | 0.000 | -0.078 | 0.076 | 
0.020 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.0.attn.qkv_self.weight + | 0.002 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.1.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.0.attn.proj.weight + | 0.003 | -0.074 | 0.074 | 0.042 | torch.Size([180]) || stage8.1.residual_group.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc11.weight + | 0.002 | -0.074 | 0.075 | 0.043 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc12.weight + | 0.001 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.0.mlp.fc2.weight + | -0.003 | -0.052 | 0.052 | 0.030 | torch.Size([180]) || stage8.1.residual_group.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.bias + | -0.000 | -0.078 | 0.075 | 0.020 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.1.attn.qkv_self.weight + | -0.003 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.1.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.1.attn.proj.weight + | 0.003 | -0.073 | 0.074 | 0.045 | torch.Size([180]) || stage8.1.residual_group.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc11.weight + | 0.000 | -0.075 | 0.074 | 0.044 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc12.weight + | 0.001 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.1.mlp.fc2.weight + | 0.001 | -0.052 | 0.052 | 0.033 | torch.Size([180]) || stage8.1.residual_group.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.bias + | -0.000 | -0.081 | 0.076 | 0.020 | 
torch.Size([2475, 6]) || stage8.1.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.2.attn.qkv_self.weight + | -0.002 | -0.074 | 0.074 | 0.042 | torch.Size([540]) || stage8.1.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.2.attn.proj.weight + | 0.002 | -0.074 | 0.074 | 0.044 | torch.Size([180]) || stage8.1.residual_group.blocks.2.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc11.weight + | -0.004 | -0.074 | 0.074 | 0.041 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc12.weight + | -0.004 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.031 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.2.mlp.fc2.weight + | 0.000 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.1.residual_group.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.bias + | 0.000 | -0.084 | 0.092 | 0.020 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.3.attn.qkv_self.weight + | -0.001 | -0.074 | 0.075 | 0.044 | torch.Size([540]) || stage8.1.residual_group.blocks.3.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.3.attn.proj.weight + | -0.003 | -0.074 | 0.074 | 0.042 | torch.Size([180]) || stage8.1.residual_group.blocks.3.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc11.weight + | -0.003 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc12.weight + | -0.002 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.3.mlp.fc2.weight + | 0.003 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.1.residual_group.blocks.3.mlp.fc2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.1.linear.weight + | 0.002 | -0.073 | 0.074 | 0.043 | torch.Size([180]) || stage8.1.linear.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || 
stage8.2.residual_group.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.bias + | -0.000 | -0.077 | 0.071 | 0.020 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.0.attn.qkv_self.weight + | -0.000 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.2.residual_group.blocks.0.attn.qkv_self.bias + | 0.001 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.0.attn.proj.weight + | -0.002 | -0.073 | 0.074 | 0.044 | torch.Size([180]) || stage8.2.residual_group.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc11.weight + | -0.000 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc12.weight + | -0.001 | -0.074 | 0.075 | 0.043 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.0.mlp.fc2.weight + | -0.000 | -0.051 | 0.053 | 0.029 | torch.Size([180]) || stage8.2.residual_group.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.bias + | -0.000 | -0.081 | 0.079 | 0.020 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.1.attn.qkv_self.weight + | -0.001 | -0.074 | 0.074 | 0.042 | torch.Size([540]) || stage8.2.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.1.attn.proj.weight + | 0.004 | -0.073 | 0.074 | 0.043 | torch.Size([180]) || stage8.2.residual_group.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc11.weight + | -0.000 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc12.weight + | 0.000 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.1.mlp.fc2.weight + | 0.002 | -0.052 | 0.052 | 0.030 | torch.Size([180]) || stage8.2.residual_group.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || 
stage8.2.residual_group.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.bias + | -0.000 | -0.081 | 0.071 | 0.020 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.2.attn.qkv_self.weight + | 0.000 | -0.074 | 0.073 | 0.044 | torch.Size([540]) || stage8.2.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.2.attn.proj.weight + | 0.001 | -0.074 | 0.074 | 0.042 | torch.Size([180]) || stage8.2.residual_group.blocks.2.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc11.weight + | -0.000 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc12.weight + | -0.003 | -0.075 | 0.074 | 0.045 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.2.mlp.fc2.weight + | 0.002 | -0.052 | 0.051 | 0.030 | torch.Size([180]) || stage8.2.residual_group.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.bias + | 0.000 | -0.075 | 0.073 | 0.020 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.3.attn.qkv_self.weight + | 0.003 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.2.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.3.attn.proj.weight + | 0.000 | -0.074 | 0.074 | 0.045 | torch.Size([180]) || stage8.2.residual_group.blocks.3.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc11.weight + | -0.001 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc12.weight + | -0.001 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.3.mlp.fc2.weight + | -0.005 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.2.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || 
stage8.2.linear.weight + | 0.000 | -0.074 | 0.073 | 0.044 | torch.Size([180]) || stage8.2.linear.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.bias + | -0.000 | -0.083 | 0.080 | 0.020 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.0.attn.qkv_self.weight + | -0.005 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.3.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.0.attn.proj.weight + | 0.004 | -0.074 | 0.074 | 0.043 | torch.Size([180]) || stage8.3.residual_group.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc11.weight + | -0.003 | -0.073 | 0.074 | 0.042 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc12.weight + | 0.004 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.0.mlp.fc2.weight + | -0.001 | -0.052 | 0.052 | 0.030 | torch.Size([180]) || stage8.3.residual_group.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.bias + | -0.000 | -0.073 | 0.087 | 0.020 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.1.attn.qkv_self.weight + | -0.000 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.3.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.1.attn.proj.weight + | -0.002 | -0.074 | 0.073 | 0.042 | torch.Size([180]) || stage8.3.residual_group.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc11.weight + | -0.001 | -0.075 | 0.075 | 0.043 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc12.weight + | 0.002 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.1.mlp.fc2.weight + | -0.002 | 
-0.052 | 0.052 | 0.030 | torch.Size([180]) || stage8.3.residual_group.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.bias + | 0.000 | -0.085 | 0.080 | 0.020 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.2.attn.qkv_self.weight + | -0.003 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.3.residual_group.blocks.2.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.2.attn.proj.weight + | 0.000 | -0.074 | 0.074 | 0.042 | torch.Size([180]) || stage8.3.residual_group.blocks.2.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc11.weight + | -0.000 | -0.074 | 0.075 | 0.045 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc12.weight + | -0.003 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.2.mlp.fc2.weight + | 0.001 | -0.051 | 0.051 | 0.030 | torch.Size([180]) || stage8.3.residual_group.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.bias + | 0.000 | -0.081 | 0.082 | 0.020 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.3.attn.qkv_self.weight + | -0.000 | -0.075 | 0.074 | 0.044 | torch.Size([540]) || stage8.3.residual_group.blocks.3.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.3.attn.proj.weight + | -0.001 | -0.074 | 0.074 | 0.045 | torch.Size([180]) || stage8.3.residual_group.blocks.3.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc11.weight + | 0.003 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc12.weight + | -0.000 | -0.074 | 0.075 | 0.046 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.3.mlp.fc2.weight + | 0.001 | -0.052 | 0.052 
| 0.030 | torch.Size([180]) || stage8.3.residual_group.blocks.3.mlp.fc2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.3.linear.weight + | -0.001 | -0.073 | 0.074 | 0.042 | torch.Size([180]) || stage8.3.linear.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.bias + | -0.000 | -0.082 | 0.079 | 0.020 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.0.attn.qkv_self.weight + | 0.002 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.4.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.0.attn.proj.weight + | 0.004 | -0.074 | 0.074 | 0.045 | torch.Size([180]) || stage8.4.residual_group.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc11.weight + | -0.001 | -0.074 | 0.074 | 0.041 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc12.weight + | 0.000 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.0.mlp.fc2.weight + | -0.001 | -0.050 | 0.052 | 0.029 | torch.Size([180]) || stage8.4.residual_group.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.bias + | 0.000 | -0.083 | 0.083 | 0.020 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.1.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.1.attn.qkv_self.weight + | -0.003 | -0.074 | 0.073 | 0.043 | torch.Size([540]) || stage8.4.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.1.attn.proj.weight + | 0.005 | -0.073 | 0.072 | 0.041 | torch.Size([180]) || stage8.4.residual_group.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc11.weight + | 0.003 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc12.weight + | 0.001 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || 
stage8.4.residual_group.blocks.1.mlp.fc12.bias
 | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.1.mlp.fc2.weight
 | 0.003 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.4.residual_group.blocks.1.mlp.fc2.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.bias
 | -0.000 | -0.075 | 0.081 | 0.020 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.2.attn.relative_position_bias_table
 | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.2.attn.relative_position_index
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.2.attn.qkv_self.weight
 | -0.000 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.4.residual_group.blocks.2.attn.qkv_self.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.2.attn.proj.weight
 | 0.001 | -0.074 | 0.074 | 0.044 | torch.Size([180]) || stage8.4.residual_group.blocks.2.attn.proj.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc11.weight
 | -0.002 | -0.075 | 0.074 | 0.043 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc11.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc12.weight
 | 0.001 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc12.bias
 | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.2.mlp.fc2.weight
 | 0.002 | -0.053 | 0.052 | 0.031 | torch.Size([180]) || stage8.4.residual_group.blocks.2.mlp.fc2.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.bias
 | -0.000 | -0.083 | 0.072 | 0.020 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.3.attn.relative_position_bias_table
 | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.3.attn.relative_position_index
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.3.attn.qkv_self.weight
 | -0.004 | -0.074 | 0.074 | 0.042 | torch.Size([540]) || stage8.4.residual_group.blocks.3.attn.qkv_self.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.3.attn.proj.weight
 | 0.004 | -0.074 | 0.072 | 0.045 | torch.Size([180]) || stage8.4.residual_group.blocks.3.attn.proj.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc11.weight
 | 0.007 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc11.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc12.weight
 | 0.001 | -0.073 | 0.075 | 0.041 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc12.bias
 | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.3.mlp.fc2.weight
 | -0.002 | -0.052 | 0.053 | 0.031 | torch.Size([180]) || stage8.4.residual_group.blocks.3.mlp.fc2.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.4.linear.weight
 | -0.008 | -0.075 | 0.072 | 0.039 | torch.Size([180]) || stage8.4.linear.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.bias
 | -0.000 | -0.058 | 0.058 | 0.020 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.0.attn.relative_position_bias_table
 | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.0.attn.relative_position_index
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.0.attn.qkv_self.weight
 | 0.001 | -0.073 | 0.075 | 0.042 | torch.Size([540]) || stage8.5.residual_group.blocks.0.attn.qkv_self.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.0.attn.proj.weight
 | 0.001 | -0.074 | 0.074 | 0.044 | torch.Size([180]) || stage8.5.residual_group.blocks.0.attn.proj.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc11.weight
 | -0.001 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc11.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc12.weight
 | -0.000 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc12.bias
 | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.0.mlp.fc2.weight
 | -0.002 | -0.051 | 0.051 | 0.031 | torch.Size([180]) || stage8.5.residual_group.blocks.0.mlp.fc2.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.bias
 | -0.000 | -0.063 | 0.060 | 0.019 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.1.attn.relative_position_bias_table
 | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.1.attn.relative_position_index
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.1.attn.qkv_self.weight
 | 0.001 | -0.074 | 0.074 | 0.042 | torch.Size([540]) || stage8.5.residual_group.blocks.1.attn.qkv_self.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.1.attn.proj.weight
 | 0.001 | -0.074 | 0.074 | 0.042 | torch.Size([180]) || stage8.5.residual_group.blocks.1.attn.proj.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc11.weight
 | 0.001 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc11.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc12.weight
 | 0.001 | -0.072 | 0.073 | 0.041 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc12.bias
 | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.1.mlp.fc2.weight
 | 0.000 | -0.052 | 0.052 | 0.030 | torch.Size([180]) || stage8.5.residual_group.blocks.1.mlp.fc2.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.bias
 | -0.000 | -0.062 | 0.058 | 0.020 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.2.attn.relative_position_bias_table
 | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.2.attn.relative_position_index
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.2.attn.qkv_self.weight
 | -0.000 | -0.075 | 0.074 | 0.044 | torch.Size([540]) || stage8.5.residual_group.blocks.2.attn.qkv_self.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.2.attn.proj.weight
 | -0.001 | -0.073 | 0.074 | 0.042 | torch.Size([180]) || stage8.5.residual_group.blocks.2.attn.proj.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc11.weight
 | 0.005 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc11.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc12.weight
 | -0.000 | -0.074 | 0.073 | 0.043 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc12.bias
 | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.2.mlp.fc2.weight
 | 0.005 | -0.050 | 0.053 | 0.031 | torch.Size([180]) || stage8.5.residual_group.blocks.2.mlp.fc2.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.bias
 | 0.001 | -0.063 | 0.061 | 0.019 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.3.attn.relative_position_bias_table
 | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.3.attn.relative_position_index
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.3.attn.qkv_self.weight
 | -0.004 | -0.074 | 0.075 | 0.042 | torch.Size([540]) || stage8.5.residual_group.blocks.3.attn.qkv_self.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.3.attn.proj.weight
 | 0.004 | -0.074 | 0.074 | 0.040 | torch.Size([180]) || stage8.5.residual_group.blocks.3.attn.proj.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc11.weight
 | 0.001 | -0.075 | 0.074 | 0.042 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc11.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc12.weight
 | -0.001 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc12.bias
 | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.3.mlp.fc2.weight
 | 0.003 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.5.residual_group.blocks.3.mlp.fc2.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.5.linear.weight
 | -0.001 | -0.074 | 0.074 | 0.042 | torch.Size([180]) || stage8.5.linear.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.bias
 | -0.000 | -0.064 | 0.077 | 0.020 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.0.attn.relative_position_bias_table
 | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.0.attn.relative_position_index
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.0.attn.qkv_self.weight
 | -0.001 | -0.075 | 0.074 | 0.043 | torch.Size([540]) || stage8.6.residual_group.blocks.0.attn.qkv_self.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.0.attn.proj.weight
 | 0.002 | -0.073 | 0.074 | 0.043 | torch.Size([180]) || stage8.6.residual_group.blocks.0.attn.proj.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc11.weight
 | -0.002 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc11.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc12.weight
 | -0.002 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc12.bias
 | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.0.mlp.fc2.weight
 | 0.002 | -0.051 | 0.052 | 0.032 | torch.Size([180]) || stage8.6.residual_group.blocks.0.mlp.fc2.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.bias
 | 0.000 | -0.074 | 0.067 | 0.020 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.1.attn.relative_position_bias_table
 | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.1.attn.relative_position_index
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.1.attn.qkv_self.weight
 | -0.000 | -0.074 | 0.074 | 0.041 | torch.Size([540]) || stage8.6.residual_group.blocks.1.attn.qkv_self.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.1.attn.proj.weight
 | -0.000 | -0.074 | 0.074 | 0.045 | torch.Size([180]) || stage8.6.residual_group.blocks.1.attn.proj.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc11.weight
 | -0.001 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc11.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc12.weight
 | 0.002 | -0.075 | 0.074 | 0.042 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc12.bias
 | -0.000 | -0.053 | 0.053 | 0.031 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.1.mlp.fc2.weight
 | -0.001 | -0.052 | 0.053 | 0.031 | torch.Size([180]) || stage8.6.residual_group.blocks.1.mlp.fc2.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.bias
 | 0.001 | -0.071 | 0.075 | 0.020 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.2.attn.relative_position_bias_table
 | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.2.attn.relative_position_index
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.2.attn.qkv_self.weight
 | 0.002 | -0.075 | 0.074 | 0.044 | torch.Size([540]) || stage8.6.residual_group.blocks.2.attn.qkv_self.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.2.attn.proj.weight
 | 0.002 | -0.073 | 0.074 | 0.043 | torch.Size([180]) || stage8.6.residual_group.blocks.2.attn.proj.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc11.weight
 | 0.004 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc11.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc12.weight
 | -0.004 | -0.074 | 0.074 | 0.041 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc12.bias
 | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.2.mlp.fc2.weight
 | -0.003 | -0.052 | 0.052 | 0.030 | torch.Size([180]) || stage8.6.residual_group.blocks.2.mlp.fc2.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.bias
 | -0.000 | -0.060 | 0.066 | 0.021 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.3.attn.relative_position_bias_table
 | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.3.attn.relative_position_index
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.3.attn.qkv_self.weight
 | -0.002 | -0.074 | 0.074 | 0.042 | torch.Size([540]) || stage8.6.residual_group.blocks.3.attn.qkv_self.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.3.attn.proj.weight
 | -0.002 | -0.074 | 0.074 | 0.044 | torch.Size([180]) || stage8.6.residual_group.blocks.3.attn.proj.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc11.weight
 | 0.003 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc11.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc12.weight
 | -0.001 | -0.074 | 0.075 | 0.044 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc12.bias
 | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.3.mlp.fc2.weight
 | 0.001 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.6.residual_group.blocks.3.mlp.fc2.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.6.linear.weight
 | -0.009 | -0.074 | 0.074 | 0.043 | torch.Size([180]) || stage8.6.linear.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || norm.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || norm.bias
 | -0.001 | -0.075 | 0.075 | 0.043 | torch.Size([120, 180]) || conv_after_body.weight
 | -0.002 | -0.074 | 0.074 | 0.044 | torch.Size([120]) || conv_after_body.bias
 | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([64, 120, 1, 3, 3]) || conv_before_upsample.0.weight
 | 0.000 | -0.029 | 0.030 | 0.016 | torch.Size([64]) || conv_before_upsample.0.bias
 | -0.000 | -0.042 | 0.042 | 0.024 | torch.Size([256, 64, 1, 3, 3]) || upsample.0.weight
 | 0.000 | -0.041 | 0.042 | 0.024 | torch.Size([256]) || upsample.0.bias
 | -0.000 | -0.042 | 0.042 | 0.024 | torch.Size([256, 64, 1, 3, 3]) || upsample.5.weight
 | 0.000 | -0.041 | 0.040 | 0.025 | torch.Size([256]) || upsample.5.bias
 | 0.000 | -0.042 | 0.042 | 0.024 | torch.Size([64, 64, 1, 3, 3]) || upsample.10.weight
 | 0.003 | -0.041 | 0.041 | 0.025 | torch.Size([64]) || upsample.10.bias
 | -0.000 | -0.042 | 0.042 | 0.024 | torch.Size([3, 64, 1, 3, 3]) || conv_last.weight
 | 0.001 | -0.039 | 0.037 | 0.038 | torch.Size([3]) || conv_last.bias
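[Editor's note] The five numeric columns in the table above are, in order, the mean, min, max, and standard deviation of each tensor, followed by its shape and the parameter/buffer name; integer-valued rows such as relative_position_index are registered buffers, not trained weights. As a reading aid, here is a minimal sketch of a helper that prints any module in this format. It is an illustrative stand-in written for this note, not the repository's own logging code:

import torch

def describe_params(model: torch.nn.Module) -> str:
    # One row per entry in state_dict(), so buffers such as
    # relative_position_index appear alongside trainable weights.
    rows = []
    for name, t in model.state_dict().items():
        t = t.float()  # integer buffers have no mean/std in integer dtype
        rows.append(' | {:.3f} | {:.3f} | {:.3f} | {:.3f} | {} || {}'.format(
            t.mean().item(), t.min().item(), t.max().item(),
            t.std().item(), t.shape, name))  # t.shape prints as torch.Size([...])
    return '\n'.join(rows)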
22-03-11 09:55:18.025 : task: 001_train_vrt_videosr_bi_reds_6frames
  model: vrt
  gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7]
  dist: False
  find_unused_parameters: False
  use_static_graph: True
  scale: 4
  n_channels: 3
  path:[
    root: experiments
    pretrained_netG: None
    pretrained_netE: None
    task: experiments/001_train_vrt_videosr_bi_reds_6frames
    log: experiments/001_train_vrt_videosr_bi_reds_6frames
    options: experiments/001_train_vrt_videosr_bi_reds_6frames/options
    models: experiments/001_train_vrt_videosr_bi_reds_6frames/models
    images: experiments/001_train_vrt_videosr_bi_reds_6frames/images
    pretrained_optimizerG: None
  ]
  datasets:[
    train:[
      name: train_dataset
      dataset_type: VideoRecurrentTrainDataset
      dataroot_gt: trainsets/REDS/train_sharp_with_val.lmdb
      dataroot_lq: trainsets/REDS/train_sharp_bicubic_with_val.lmdb
      meta_info_file: data/meta_info/meta_info_REDS_GT.txt
      filename_tmpl: 08d
      filename_ext: png
      val_partition: REDS4
      test_mode: False
      io_backend:[
        type: lmdb
      ]
      num_frame: 6
      gt_size: 256
      interval_list: [1]
      random_reverse: False
      use_hflip: True
      use_rot: True
      dataloader_shuffle: True
      dataloader_num_workers: 32
      dataloader_batch_size: 8
      phase: train
      scale: 4
      n_channels: 3
    ]
    test:[
      name: test_dataset
      dataset_type: VideoRecurrentTestDataset
      dataroot_gt: testsets/REDS4/GT
      dataroot_lq: testsets/REDS4/sharp_bicubic
      cache_data: True
      io_backend:[
        type: disk
      ]
      num_frame: -1
      phase: test
      scale: 4
      n_channels: 3
    ]
  ]
  netG:[
    net_type: vrt
    upscale: 4
    img_size: [6, 64, 64]
    window_size: [6, 8, 8]
    depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4]
    indep_reconsts: [11, 12]
    embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180]
    num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
    spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth
    pa_frames: 2
    deformable_groups: 12
    nonblind_denoising: False
    use_checkpoint_attn: False
    use_checkpoint_ffn: False
    no_checkpoint_attn_blocks: []
    no_checkpoint_ffn_blocks: []
    init_type: default
    scale: 4
  ]
  train:[
    G_lossfn_type: charbonnier
    G_lossfn_weight: 1.0
    G_charbonnier_eps: 1e-09
    E_decay: 0
    G_optimizer_type: adam
    G_optimizer_lr: 0.0004
    G_optimizer_betas: [0.9, 0.99]
    G_optimizer_wd: 0
    G_optimizer_clipgrad: None
    G_optimizer_reuse: True
    fix_iter: 20000
    fix_lr_mul: 0.125
    fix_keys: ['spynet', 'deform']
    total_iter: 300000
    G_scheduler_type: CosineAnnealingWarmRestarts
    G_scheduler_periods: 300000
    G_scheduler_eta_min: 1e-07
    G_regularizer_orthstep: None
    G_regularizer_clipstep: None
    G_param_strict: True
    E_param_strict: True
    checkpoint_test: 5000
    checkpoint_save: 5000
    checkpoint_print: 200
    F_feature_layer: 34
    F_weights: 1.0
    F_lossfn_type: l1
    F_use_input_norm: True
    F_use_range_norm: False
    G_scheduler_restart_weights: 1
  ]
  val:[
    save_img: False
    pad_seq: False
    flip_seq: False
    center_frame_only: False
    num_frame_testing: 40
    num_frame_overlapping: 2
    size_patch_testing: 128
  ]
  opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json
  is_train: True
  merge_bn: False
  merge_bn_startpoint: -1
  num_gpu: 8
  rank: 0
  world_size: 1

22-03-11 09:55:18.071 : Number of train images: 27,000, iters: 3,375
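[Editor's note] With 27,000 training images and dataloader_batch_size 8, one pass over the training set is 27,000 / 8 = 3,375 iterations, the figure logged above. In the network dump that follows, every TMSA block pairs window attention with an Mlp_GEGLU feed-forward whose fc11/fc12 pair forms a gated GELU (GEGLU) unit. A minimal sketch of that gating idea, as the printed fc11/fc12/act/fc2 layout suggests (an illustrative re-implementation, not the repository's code):

import torch.nn as nn

class MlpGEGLU(nn.Module):
    # Gated-GELU MLP: the value branch (fc11) is passed through GELU and
    # multiplied elementwise with a linear gate branch (fc12), then
    # projected back down by fc2 -- the fc11/fc12/act/fc2 layout below.
    def __init__(self, dim: int, hidden: int, drop: float = 0.0):
        super().__init__()
        self.fc11 = nn.Linear(dim, hidden)
        self.fc12 = nn.Linear(dim, hidden)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden, dim)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        return self.drop(self.fc2(self.act(self.fc11(x)) * self.fc12(x)))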
22-03-11 09:55:21.359 :
Networks name: VRT
Params number: 30676435
Net structure:
VRT(
  (conv_first): Conv3d(27, 120, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1))
  (spynet): SpyNet(
    (basic_module): ModuleList(
      (0-5): 6 x BasicModule(
        (basic_module): Sequential(
          (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
          (1): ReLU()
          (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
          (3): ReLU()
          (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
          (5): ReLU()
          (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
          (7): ReLU()
          (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
        )
      )
    )
  )
  (stage1): Stage(
    (reshape): Sequential(
      (0): Rearrange('n c d h w -> n d h w c')
      (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
      (2): Rearrange('n d h w c -> n c d h w')
    )
    (residual_group1): TMSAG(
      (blocks): ModuleList(
        (0): TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=240, out_features=120, bias=True)
            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): Identity()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
        (1-5): 5 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=240, out_features=120, bias=True)
            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear1): Linear(in_features=120, out_features=120, bias=True)
    (residual_group2): TMSAG(
      (blocks): ModuleList(
        (0): TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=120, out_features=120, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): Identity()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
        (1): TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=120, out_features=120, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear2): Linear(in_features=120, out_features=120, bias=True)
    (pa_deform): DCNv2PackFlowGuided(
      (conv_offset): Sequential(
        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LeakyReLU(negative_slope=0.1, inplace=True)
        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): LeakyReLU(negative_slope=0.1, inplace=True)
        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (5): LeakyReLU(negative_slope=0.1, inplace=True)
        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (pa_fuse): Mlp_GEGLU(
      (fc11): Linear(in_features=360, out_features=360, bias=True)
      (fc12): Linear(in_features=360, out_features=360, bias=True)
      (act): GELU()
      (fc2): Linear(in_features=360, out_features=120, bias=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
  )
  (stage2): Stage(
    (reshape): Sequential(
      (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2)
      (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True)
      (2): Linear(in_features=480, out_features=120, bias=True)
      (3): Rearrange('n d h w c -> n c d h w')
    )
    (residual_group1): TMSAG(
      (blocks): ModuleList(
        (0-5): 6 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=240, out_features=120, bias=True)
            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear1): Linear(in_features=120, out_features=120, bias=True)
    (residual_group2): TMSAG(
      (blocks): ModuleList(
        (0-1): 2 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=120, out_features=120, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear2): Linear(in_features=120, out_features=120, bias=True)
    (pa_deform): DCNv2PackFlowGuided(
      (conv_offset): Sequential(
        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LeakyReLU(negative_slope=0.1, inplace=True)
        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): LeakyReLU(negative_slope=0.1, inplace=True)
        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (5): LeakyReLU(negative_slope=0.1, inplace=True)
        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (pa_fuse): Mlp_GEGLU(
      (fc11): Linear(in_features=360, out_features=360, bias=True)
      (fc12): Linear(in_features=360, out_features=360, bias=True)
      (act): GELU()
      (fc2): Linear(in_features=360, out_features=120, bias=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
  )
  (stage3): Stage(
    (reshape): Sequential(
      (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2)
      (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True)
      (2): Linear(in_features=480, out_features=120, bias=True)
      (3): Rearrange('n d h w c -> n c d h w')
    )
    (residual_group1): TMSAG(
      (blocks): ModuleList(
        (0-5): 6 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=240, out_features=120, bias=True)
            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear1): Linear(in_features=120, out_features=120, bias=True)
    (residual_group2): TMSAG(
      (blocks): ModuleList(
        (0-1): 2 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=120, out_features=120, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear2): Linear(in_features=120, out_features=120, bias=True)
    (pa_deform): DCNv2PackFlowGuided(
      (conv_offset): Sequential(
        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LeakyReLU(negative_slope=0.1, inplace=True)
        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): LeakyReLU(negative_slope=0.1, inplace=True)
        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (5): LeakyReLU(negative_slope=0.1, inplace=True)
        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (pa_fuse): Mlp_GEGLU(
      (fc11): Linear(in_features=360, out_features=360, bias=True)
      (fc12): Linear(in_features=360, out_features=360, bias=True)
      (act): GELU()
      (fc2): Linear(in_features=360, out_features=120, bias=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
  )
  (stage4): Stage(
    (reshape): Sequential(
      (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2)
      (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True)
      (2): Linear(in_features=480, out_features=120, bias=True)
      (3): Rearrange('n d h w c -> n c d h w')
    )
    (residual_group1): TMSAG(
      (blocks): ModuleList(
        (0-5): 6 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=240, out_features=120, bias=True)
            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear1): Linear(in_features=120, out_features=120, bias=True)
    (residual_group2): TMSAG(
      (blocks): ModuleList(
        (0-1): 2 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=120, out_features=120, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear2): Linear(in_features=120, out_features=120, bias=True)
    (pa_deform): DCNv2PackFlowGuided(
      (conv_offset): Sequential(
        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LeakyReLU(negative_slope=0.1, inplace=True)
        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): LeakyReLU(negative_slope=0.1, inplace=True)
        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (5): LeakyReLU(negative_slope=0.1, inplace=True)
        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (pa_fuse): Mlp_GEGLU(
      (fc11): Linear(in_features=360, out_features=360, bias=True)
      (fc12): Linear(in_features=360, out_features=360, bias=True)
      (act): GELU()
      (fc2): Linear(in_features=360, out_features=120, bias=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
  )
  (stage5): Stage(
    (reshape): Sequential(
      (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2)
      (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True)
      (2): Linear(in_features=30, out_features=120, bias=True)
      (3): Rearrange('n d h w c -> n c d h w')
    )
    (residual_group1): TMSAG(
      (blocks): ModuleList(
        (0-5): 6 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=240, out_features=120, bias=True)
            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear1): Linear(in_features=120, out_features=120, bias=True)
    (residual_group2): TMSAG(
      (blocks): ModuleList(
        (0-1): 2 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=120, out_features=120, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear2): Linear(in_features=120, out_features=120, bias=True)
    (pa_deform): DCNv2PackFlowGuided(
      (conv_offset): Sequential(
        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LeakyReLU(negative_slope=0.1, inplace=True)
        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): LeakyReLU(negative_slope=0.1, inplace=True)
        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (5): LeakyReLU(negative_slope=0.1, inplace=True)
        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (pa_fuse): Mlp_GEGLU(
      (fc11): Linear(in_features=360, out_features=360, bias=True)
      (fc12): Linear(in_features=360, out_features=360, bias=True)
      (act): GELU()
      (fc2): Linear(in_features=360, out_features=120, bias=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
  )
  (stage6): Stage(
    (reshape): Sequential(
      (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2)
      (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True)
      (2): Linear(in_features=30, out_features=120, bias=True)
      (3): Rearrange('n d h w c -> n c d h w')
    )
    (residual_group1): TMSAG(
      (blocks): ModuleList(
        (0-5): 6 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=240, out_features=120, bias=True)
            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear1): Linear(in_features=120, out_features=120, bias=True)
    (residual_group2): TMSAG(
      (blocks): ModuleList(
        (0-1): 2 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=120, out_features=120, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear2): Linear(in_features=120, out_features=120, bias=True)
    (pa_deform): DCNv2PackFlowGuided(
      (conv_offset): Sequential(
        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LeakyReLU(negative_slope=0.1, inplace=True)
        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): LeakyReLU(negative_slope=0.1, inplace=True)
        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (5): LeakyReLU(negative_slope=0.1, inplace=True)
        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (pa_fuse): Mlp_GEGLU(
      (fc11): Linear(in_features=360, out_features=360, bias=True)
      (fc12): Linear(in_features=360, out_features=360, bias=True)
      (act): GELU()
      (fc2): Linear(in_features=360, out_features=120, bias=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
  )
  (stage7): Stage(
    (reshape): Sequential(
      (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2)
      (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True)
      (2): Linear(in_features=30, out_features=120, bias=True)
      (3): Rearrange('n d h w c -> n c d h w')
    )
    (residual_group1): TMSAG(
      (blocks): ModuleList(
        (0-5): 6 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=240, out_features=120, bias=True)
            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear1): Linear(in_features=120, out_features=120, bias=True)
    (residual_group2): TMSAG(
      (blocks): ModuleList(
        (0-1): 2 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=120, out_features=120, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear2): Linear(in_features=120, out_features=120, bias=True)
    (pa_deform): DCNv2PackFlowGuided(
      (conv_offset): Sequential(
        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LeakyReLU(negative_slope=0.1, inplace=True)
        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): LeakyReLU(negative_slope=0.1, inplace=True)
        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (5): LeakyReLU(negative_slope=0.1, inplace=True)
        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (pa_fuse): Mlp_GEGLU(
      (fc11): Linear(in_features=360, out_features=360, bias=True)
      (fc12): Linear(in_features=360, out_features=360, bias=True)
      (act): GELU()
      (fc2): Linear(in_features=360, out_features=120, bias=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
  )
  (stage8): ModuleList(
    (0): Sequential(
      (0): Rearrange('n c d h w -> n d h w c')
      (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
      (2): Linear(in_features=120, out_features=180, bias=True)
      (3): Rearrange('n d h w c -> n c d h w')
    )
    (1): RTMSA(
      (residual_group): TMSAG(
        (blocks): ModuleList(
          (0): TMSA(
            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
            (attn): WindowAttention(
              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
              (proj): Linear(in_features=180, out_features=180, bias=True)
              (softmax): Softmax(dim=-1)
            )
            (drop_path): DropPath()
            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
            (mlp): Mlp_GEGLU(
              (fc11): Linear(in_features=180, out_features=360, bias=True)
              (fc12): Linear(in_features=180, out_features=360, bias=True)
              (act): GELU()
              (fc2): Linear(in_features=360, out_features=180, bias=True)
              (drop): Dropout(p=0.0, inplace=False)
            )
          )
          (1): TMSA(
            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
            (attn): WindowAttention(
              (qkv_self): Linear(in_features=180,
out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (2): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): 
Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (3): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + 
(linear): Linear(in_features=180, out_features=180, bias=True) + ) + (4): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (5): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): 
Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (6): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): 
Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + ) + (norm): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (conv_after_body): Linear(in_features=180, out_features=120, bias=True) + (conv_before_upsample): Sequential( + (0): Conv3d(120, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): LeakyReLU(negative_slope=0.01, inplace=True) + ) + (upsample): Upsample( + (0): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): Transpose_Dim12() + (2): PixelShuffle(upscale_factor=2) + (3): Transpose_Dim12() + (4): LeakyReLU(negative_slope=0.1, inplace=True) + (5): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (6): Transpose_Dim12() + (7): PixelShuffle(upscale_factor=2) + (8): Transpose_Dim12() + (9): LeakyReLU(negative_slope=0.1, inplace=True) + (10): Conv3d(64, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + ) + (conv_last): Conv3d(64, 3, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) +) + +22-03-11 09:55:21.536 : + | mean | min | max | std || shape + | 0.000 | -0.064 | 0.064 | 0.037 | torch.Size([120, 27, 1, 3, 3]) || conv_first.weight + | 0.000 | -0.062 | 0.064 | 0.037 | torch.Size([120]) || conv_first.bias + | 0.449 | 0.406 | 0.485 | 0.040 | torch.Size([1, 3, 1, 1]) || spynet.mean + | 0.226 | 0.224 | 0.229 | 0.003 | torch.Size([1, 3, 1, 1]) || spynet.std + | -0.000 | -0.684 | 0.720 | 0.066 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.0.basic_module.0.weight + | -0.055 | -0.917 | 0.306 | 0.335 | torch.Size([32]) || spynet.basic_module.0.basic_module.0.bias + | -0.009 | -3.201 | 0.948 | 0.096 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.0.basic_module.2.weight + | 0.039 | -1.273 | 0.675 | 0.311 | torch.Size([64]) || spynet.basic_module.0.basic_module.2.bias + | -0.010 | -4.690 | 0.568 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.0.basic_module.4.weight + | 0.162 | -0.704 | 0.905 | 0.366 | torch.Size([32]) || spynet.basic_module.0.basic_module.4.bias + | -0.023 | -1.714 | 0.414 | 0.091 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.0.basic_module.6.weight + | 0.787 | -1.061 | 1.170 | 0.522 | torch.Size([16]) || spynet.basic_module.0.basic_module.6.bias + | 0.000 | -0.145 | 0.166 | 0.018 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.0.basic_module.8.weight + | -0.000 | -0.001 | 0.000 | 0.001 | torch.Size([2]) || spynet.basic_module.0.basic_module.8.bias + | -0.000 | -0.726 | 0.782 | 0.070 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.1.basic_module.0.weight + | -0.024 | -0.810 | 0.352 | 0.313 | torch.Size([32]) || 
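The structure dump above fixes every shape in the 120-dim TMSA blocks: qkv_self and qkv_mut each map 120 -> 360 (a packed q, k, v), while proj maps 240 -> 120, consistent with the self- and mutual-attention outputs (120 + 120) being concatenated channel-wise before projection. The MLP is a GEGLU variant: two parallel 120 -> 240 branches gated against each other, then 240 -> 120. Below is a minimal sketch that reproduces these shapes, assuming the standard GEGLU formulation fc2(GELU(fc11(x)) * fc12(x)); it is an illustration, not the KAIR source.

import torch
import torch.nn as nn

class MlpGEGLU(nn.Module):
    # Mirrors the printed Mlp_GEGLU: fc11/fc12 (120 -> 240), GELU, fc2 (240 -> 120).
    def __init__(self, dim=120, hidden=240):
        super().__init__()
        self.fc11 = nn.Linear(dim, hidden)  # gate branch
        self.fc12 = nn.Linear(dim, hidden)  # value branch
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden, dim)
        self.drop = nn.Dropout(0.0)

    def forward(self, x):
        # GEGLU: activated gate multiplied element-wise with the value branch
        return self.drop(self.fc2(self.act(self.fc11(x)) * self.fc12(x)))

dim = 120
qkv_self = nn.Linear(dim, 3 * dim)  # 120 -> 360: packed q, k, v (self-attention)
qkv_mut = nn.Linear(dim, 3 * dim)   # 120 -> 360: packed q, k, v (mutual attention)
proj = nn.Linear(2 * dim, dim)      # 240 -> 120: fuse the two attention outputs

x = torch.randn(8, dim)                          # 8 tokens of width 120
v_self = qkv_self(x).chunk(3, dim=-1)[2]         # stand-ins for attended values;
v_mut = qkv_mut(x).chunk(3, dim=-1)[2]           # the real windowed attention is omitted
out = proj(torch.cat([v_self, v_mut], dim=-1))   # (8, 120)
print(MlpGEGLU()(out).shape)                     # torch.Size([8, 120])

Note that the residual_group2 blocks print no qkv_mut and a 120-in proj: only self-attention, so there is nothing to concatenate.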
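The tail of the dump also shows how x4 upsampling is applied to 5-D video tensors: a Conv3d packs 2x2 sub-pixel channels (64 -> 256), Transpose_Dim12 swaps the channel and temporal axes so PixelShuffle sees per-frame maps in its last three dims, and the Conv3d/shuffle pair is repeated for the overall x4. A short sketch of one x2 step, assuming PixelShuffle's support for extra leading batch dims and modeling Transpose_Dim12 with a plain transpose:

import torch
import torch.nn as nn

x = torch.randn(1, 64, 6, 32, 32)  # (N, C, D, H, W): 6-frame feature video
conv = nn.Conv3d(64, 256, kernel_size=(1, 3, 3), padding=(0, 1, 1))
shuffle = nn.PixelShuffle(2)

y = conv(x)            # (1, 256, 6, 32, 32): 64 * 2 * 2 sub-pixel channels
y = y.transpose(1, 2)  # (1, 6, 256, 32, 32): what Transpose_Dim12 does
y = shuffle(y)         # (1, 6, 64, 64, 64): 2x in H and W, per frame
y = y.transpose(1, 2)  # (1, 64, 6, 64, 64): back to (N, C, D, H, W)
print(y.shape)         # the dump above applies this block twice for x4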
spynet.basic_module.1.basic_module.0.bias + | -0.008 | -3.370 | 0.914 | 0.098 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.1.basic_module.2.weight + | 0.042 | -1.197 | 0.699 | 0.302 | torch.Size([64]) || spynet.basic_module.1.basic_module.2.bias + | -0.008 | -4.468 | 0.566 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.1.basic_module.4.weight + | 0.160 | -0.745 | 0.996 | 0.391 | torch.Size([32]) || spynet.basic_module.1.basic_module.4.bias + | -0.017 | -1.648 | 0.317 | 0.084 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.1.basic_module.6.weight + | 0.785 | -1.176 | 1.158 | 0.543 | torch.Size([16]) || spynet.basic_module.1.basic_module.6.bias + | 0.000 | -0.145 | 0.163 | 0.014 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.1.basic_module.8.weight + | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([2]) || spynet.basic_module.1.basic_module.8.bias + | 0.000 | -1.003 | 0.875 | 0.089 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.2.basic_module.0.weight + | -0.021 | -0.979 | 0.466 | 0.373 | torch.Size([32]) || spynet.basic_module.2.basic_module.0.bias + | -0.008 | -4.622 | 1.220 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.2.basic_module.2.weight + | 0.028 | -1.276 | 0.717 | 0.308 | torch.Size([64]) || spynet.basic_module.2.basic_module.2.bias + | -0.007 | -1.827 | 0.624 | 0.092 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.2.basic_module.4.weight + | 0.123 | -0.697 | 0.745 | 0.334 | torch.Size([32]) || spynet.basic_module.2.basic_module.4.bias + | -0.010 | -1.295 | 0.330 | 0.068 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.2.basic_module.6.weight + | 0.677 | -1.696 | 0.934 | 0.637 | torch.Size([16]) || spynet.basic_module.2.basic_module.6.bias + | 0.000 | -0.114 | 0.129 | 0.008 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.2.basic_module.8.weight + | -0.003 | -0.008 | 0.002 | 0.007 | torch.Size([2]) || spynet.basic_module.2.basic_module.8.bias + | 0.000 | -1.053 | 0.952 | 0.091 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.3.basic_module.0.weight + | -0.016 | -1.061 | 0.522 | 0.414 | torch.Size([32]) || spynet.basic_module.3.basic_module.0.bias + | -0.008 | -4.891 | 1.222 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.3.basic_module.2.weight + | 0.029 | -1.264 | 0.760 | 0.309 | torch.Size([64]) || spynet.basic_module.3.basic_module.2.bias + | -0.007 | -1.792 | 0.579 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.3.basic_module.4.weight + | 0.117 | -0.694 | 0.670 | 0.329 | torch.Size([32]) || spynet.basic_module.3.basic_module.4.bias + | -0.008 | -1.108 | 0.324 | 0.065 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.3.basic_module.6.weight + | 0.652 | -1.754 | 0.901 | 0.647 | torch.Size([16]) || spynet.basic_module.3.basic_module.6.bias + | 0.000 | -0.117 | 0.129 | 0.008 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.3.basic_module.8.weight + | 0.002 | -0.003 | 0.007 | 0.007 | torch.Size([2]) || spynet.basic_module.3.basic_module.8.bias + | -0.000 | -1.085 | 0.998 | 0.092 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.4.basic_module.0.weight + | 0.009 | -0.975 | 0.477 | 0.368 | torch.Size([32]) || spynet.basic_module.4.basic_module.0.bias + | -0.008 | -5.056 | 1.282 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.4.basic_module.2.weight + | 0.029 | -1.240 | 0.796 | 0.311 | torch.Size([64]) || spynet.basic_module.4.basic_module.2.bias + | -0.007 | -1.772 | 0.600 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.4.basic_module.4.weight + | 0.121 | -0.688 | 0.694 | 0.331 
| torch.Size([32]) || spynet.basic_module.4.basic_module.4.bias + | -0.007 | -0.980 | 0.320 | 0.065 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.4.basic_module.6.weight + | 0.642 | -1.810 | 0.912 | 0.662 | torch.Size([16]) || spynet.basic_module.4.basic_module.6.bias + | 0.000 | -0.188 | 0.209 | 0.011 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.4.basic_module.8.weight + | -0.002 | -0.008 | 0.005 | 0.009 | torch.Size([2]) || spynet.basic_module.4.basic_module.8.bias + | -0.000 | -1.085 | 0.999 | 0.092 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.5.basic_module.0.weight + | 0.009 | -0.982 | 0.474 | 0.368 | torch.Size([32]) || spynet.basic_module.5.basic_module.0.bias + | -0.008 | -5.089 | 1.311 | 0.119 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.5.basic_module.2.weight + | 0.029 | -1.256 | 0.804 | 0.314 | torch.Size([64]) || spynet.basic_module.5.basic_module.2.bias + | -0.008 | -1.788 | 0.613 | 0.093 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.5.basic_module.4.weight + | 0.122 | -0.699 | 0.700 | 0.334 | torch.Size([32]) || spynet.basic_module.5.basic_module.4.bias + | -0.008 | -1.010 | 0.323 | 0.067 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.5.basic_module.6.weight + | 0.650 | -1.834 | 0.923 | 0.670 | torch.Size([16]) || spynet.basic_module.5.basic_module.6.bias + | 0.000 | -0.192 | 0.213 | 0.011 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.5.basic_module.8.weight + | -0.001 | -0.007 | 0.005 | 0.009 | torch.Size([2]) || spynet.basic_module.5.basic_module.8.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.reshape.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.reshape.1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.069 | 0.063 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_self.weight + | 0.001 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.attn.proj.weight + | -0.001 | -0.063 | 0.065 | 0.035 | torch.Size([120]) || stage1.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc11.weight + | 0.003 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc12.weight + | 0.003 | -0.090 | 0.091 | 0.054 | torch.Size([240]) || 
stage1.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.mlp.fc2.weight + | 0.004 | -0.064 | 0.064 | 0.040 | torch.Size([120]) || stage1.residual_group1.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.bias + | 0.000 | -0.066 | 0.076 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_self.weight + | 0.002 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.attn.proj.weight + | 0.001 | -0.065 | 0.064 | 0.037 | torch.Size([120]) || stage1.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.002 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc11.weight + | -0.005 | -0.091 | 0.091 | 0.055 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc12.weight + | 0.002 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.mlp.fc2.weight + | -0.003 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage1.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.bias + | -0.001 | -0.074 | 0.067 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.2.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_self.weight + | -0.002 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.064 | 0.064 | 0.040 | torch.Size([120]) || stage1.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.003 | -0.091 | 0.090 | 0.053 | torch.Size([360]) || 
stage1.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc11.weight + | -0.004 | -0.090 | 0.091 | 0.051 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc12.weight + | 0.008 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.mlp.fc2.weight + | 0.000 | -0.063 | 0.062 | 0.034 | torch.Size([120]) || stage1.residual_group1.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.bias + | 0.000 | -0.068 | 0.072 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_self.weight + | 0.003 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.attn.proj.weight + | -0.005 | -0.060 | 0.063 | 0.037 | torch.Size([120]) || stage1.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.000 | -0.090 | 0.091 | 0.052 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc11.weight + | 0.004 | -0.089 | 0.091 | 0.053 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc12.weight + | 0.001 | -0.090 | 0.091 | 0.055 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.mlp.fc2.weight + | -0.002 | -0.062 | 0.063 | 0.034 | torch.Size([120]) || stage1.residual_group1.blocks.3.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.080 | 0.073 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || 
stage1.residual_group1.blocks.4.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_self.weight + | 0.000 | -0.090 | 0.091 | 0.054 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.attn.proj.weight + | 0.002 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage1.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.002 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc11.weight + | -0.007 | -0.090 | 0.089 | 0.048 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc12.weight + | -0.001 | -0.091 | 0.088 | 0.055 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.mlp.fc2.weight + | 0.003 | -0.063 | 0.064 | 0.037 | torch.Size([120]) || stage1.residual_group1.blocks.4.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.bias + | -0.000 | -0.066 | 0.077 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_self.weight + | 0.002 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.attn.proj.weight + | 0.005 | -0.065 | 0.064 | 0.041 | torch.Size([120]) || stage1.residual_group1.blocks.5.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc11.weight + | -0.003 | -0.091 | 0.090 | 0.055 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc12.weight + | -0.001 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.mlp.fc2.weight 
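In this parameter table, each row lists mean | min | max | std | shape, then the state_dict key after the double bar; the printed header omits the name column. Since integer buffers such as relative_position_index appear alongside weights, the walk must be over state_dict() rather than named_parameters(). A minimal sketch of such a printer follows; the helper KAIR actually uses may format things differently.

import torch.nn as nn

def param_table(model: nn.Module) -> str:
    # One row per state_dict entry: mean | min | max | std | shape || name.
    rows = [" | mean | min | max | std | shape || name"]
    for name, t in model.state_dict().items():
        v = t.detach().float()  # integer buffers need a cast before mean()/std()
        rows.append(" | {:.3f} | {:.3f} | {:.3f} | {:.3f} | {} || {}".format(
            v.mean(), v.min(), v.max(), v.std(), t.shape, name))
    return "\n".join(rows)

print(param_table(nn.Sequential(nn.Linear(120, 360), nn.LayerNorm(360))))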
+ | -0.003 | -0.064 | 0.063 | 0.038 | torch.Size([120]) || stage1.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([120, 120]) || stage1.linear1.weight + | -0.001 | -0.090 | 0.091 | 0.057 | torch.Size([120]) || stage1.linear1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.bias + | -0.000 | -0.074 | 0.073 | 0.020 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group2.blocks.0.attn.qkv_self.weight + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage1.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage1.residual_group2.blocks.0.attn.proj.weight + | 0.001 | -0.090 | 0.089 | 0.051 | torch.Size([120]) || stage1.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc11.weight + | 0.009 | -0.090 | 0.090 | 0.051 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc12.weight + | -0.004 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group2.blocks.0.mlp.fc2.weight + | 0.001 | -0.064 | 0.063 | 0.035 | torch.Size([120]) || stage1.residual_group2.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.093 | 0.079 | 0.020 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group2.blocks.1.attn.qkv_self.weight + | 0.003 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage1.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage1.residual_group2.blocks.1.attn.proj.weight + | -0.003 | -0.090 | 0.091 | 0.056 | torch.Size([120]) || stage1.residual_group2.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc11.weight + | 0.002 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc12.weight + | -0.004 | -0.091 | 0.089 | 0.054 | torch.Size([240]) || 
stage1.residual_group2.blocks.1.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group2.blocks.1.mlp.fc2.weight + | 0.007 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage1.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage1.linear2.weight + | 0.005 | -0.091 | 0.086 | 0.052 | torch.Size([120]) || stage1.linear2.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.pa_deform.bias + | -0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage1.pa_deform.conv_offset.0.weight + | 0.001 | -0.021 | 0.021 | 0.012 | torch.Size([120]) || stage1.pa_deform.conv_offset.0.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.2.weight + | -0.000 | -0.030 | 0.029 | 0.019 | torch.Size([120]) || stage1.pa_deform.conv_offset.2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.4.weight + | 0.000 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage1.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage1.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage1.pa_deform.conv_offset.6.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage1.pa_fuse.fc11.weight + | -0.001 | -0.053 | 0.053 | 0.031 | torch.Size([360]) || stage1.pa_fuse.fc11.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage1.pa_fuse.fc12.weight + | 0.001 | -0.051 | 0.053 | 0.030 | torch.Size([360]) || stage1.pa_fuse.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage1.pa_fuse.fc2.weight + | 0.000 | -0.052 | 0.053 | 0.032 | torch.Size([120]) || stage1.pa_fuse.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([480]) || stage2.reshape.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([480]) || stage2.reshape.1.bias + | 0.000 | -0.046 | 0.046 | 0.026 | torch.Size([120, 480]) || stage2.reshape.2.weight + | -0.001 | -0.044 | 0.043 | 0.026 | torch.Size([120]) || stage2.reshape.2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.067 | 0.061 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_self.weight + | 0.001 | -0.090 | 0.091 | 0.051 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.attn.proj.weight + | 0.001 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage2.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.006 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || 
stage2.residual_group1.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc11.weight + | -0.009 | -0.091 | 0.090 | 0.055 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc12.weight + | -0.003 | -0.090 | 0.091 | 0.052 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.mlp.fc2.weight + | -0.001 | -0.063 | 0.062 | 0.037 | torch.Size([120]) || stage2.residual_group1.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.bias + | -0.001 | -0.070 | 0.072 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_self.weight + | 0.001 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.attn.proj.weight + | 0.002 | -0.064 | 0.064 | 0.036 | torch.Size([120]) || stage2.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.003 | -0.091 | 0.090 | 0.050 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc11.weight + | 0.000 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc12.weight + | 0.013 | -0.090 | 0.090 | 0.052 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.mlp.fc2.weight + | 0.001 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage2.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.bias + | -0.000 | -0.076 | 0.073 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || 
stage2.residual_group1.blocks.2.attn.qkv_self.weight
+ | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.attn.proj.weight
+ | 0.001 | -0.063 | 0.064 | 0.039 | torch.Size([120]) || stage2.residual_group1.blocks.2.attn.proj.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_mut.weight
+ | -0.002 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.006 | -0.090 | 0.090 | 0.051 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc12.weight
+ | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.mlp.fc2.weight
+ | -0.002 | -0.064 | 0.064 | 0.037 | torch.Size([120]) || stage2.residual_group1.blocks.2.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.bias
+ | -0.000 | -0.084 | 0.068 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_self.weight
+ | -0.002 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.attn.proj.weight
+ | -0.002 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage2.residual_group1.blocks.3.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_mut.weight
+ | -0.001 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc11.weight
+ | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc12.weight
+ | 0.005 | -0.086 | 0.090 | 0.052 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.003 | -0.063 | 0.064 | 0.037 | torch.Size([120]) || stage2.residual_group1.blocks.3.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.bias
+ | 0.000 | -0.070 | 0.072 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_self.weight
+ | 0.003 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.attn.proj.weight
+ | 0.006 | -0.058 | 0.064 | 0.036 | torch.Size([120]) || stage2.residual_group1.blocks.4.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_mut.weight
+ | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc11.weight
+ | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc12.weight
+ | -0.002 | -0.089 | 0.091 | 0.051 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.mlp.fc2.weight
+ | 0.006 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage2.residual_group1.blocks.4.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.bias
+ | 0.000 | -0.070 | 0.080 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_self.weight
+ | -0.000 | -0.091 | 0.090 | 0.050 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.attn.proj.weight
+ | -0.000 | -0.064 | 0.064 | 0.037 | torch.Size([120]) || stage2.residual_group1.blocks.5.attn.proj.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_mut.weight
+ | 0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc11.weight
+ | 0.004 | -0.091 | 0.090 | 0.051 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc11.bias
+ | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc12.weight
+ | -0.005 | -0.090 | 0.091 | 0.053 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.mlp.fc2.weight
+ | 0.002 | -0.064 | 0.064 | 0.036 | torch.Size([120]) || stage2.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage2.linear1.weight
+ | 0.005 | -0.091 | 0.091 | 0.055 | torch.Size([120]) || stage2.linear1.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.bias
+ | -0.000 | -0.079 | 0.073 | 0.020 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group2.blocks.0.attn.qkv_self.weight
+ | -0.002 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage2.residual_group2.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage2.residual_group2.blocks.0.attn.proj.weight
+ | -0.002 | -0.091 | 0.088 | 0.052 | torch.Size([120]) || stage2.residual_group2.blocks.0.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc11.weight
+ | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group2.blocks.0.mlp.fc2.weight
+ | 0.002 | -0.064 | 0.063 | 0.035 | torch.Size([120]) || stage2.residual_group2.blocks.0.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.bias
+ | -0.000 | -0.076 | 0.082 | 0.020 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.1.attn.relative_position_index
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group2.blocks.1.attn.qkv_self.weight
+ | -0.002 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage2.residual_group2.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage2.residual_group2.blocks.1.attn.proj.weight
+ | -0.001 | -0.091 | 0.091 | 0.052 | torch.Size([120]) || stage2.residual_group2.blocks.1.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc11.weight
+ | 0.002 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc11.bias
+ | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc12.weight
+ | -0.007 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc12.bias
+ | 0.001 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group2.blocks.1.mlp.fc2.weight
+ | 0.002 | -0.065 | 0.064 | 0.037 | torch.Size([120]) || stage2.residual_group2.blocks.1.mlp.fc2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage2.linear2.weight
+ | 0.000 | -0.088 | 0.091 | 0.053 | torch.Size([120]) || stage2.linear2.bias
+ | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.pa_deform.bias
+ | -0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage2.pa_deform.conv_offset.0.weight
+ | -0.001 | -0.021 | 0.021 | 0.013 | torch.Size([120]) || stage2.pa_deform.conv_offset.0.bias
+ | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.2.weight
+ | -0.002 | -0.030 | 0.029 | 0.017 | torch.Size([120]) || stage2.pa_deform.conv_offset.2.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.4.weight
+ | -0.001 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage2.pa_deform.conv_offset.4.bias
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage2.pa_deform.conv_offset.6.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage2.pa_deform.conv_offset.6.bias
+ | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage2.pa_fuse.fc11.weight
+ | -0.002 | -0.053 | 0.052 | 0.030 | torch.Size([360]) || stage2.pa_fuse.fc11.bias
+ | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage2.pa_fuse.fc12.weight
+ | -0.001 | -0.052 | 0.053 | 0.031 | torch.Size([360]) || stage2.pa_fuse.fc12.bias
+ | -0.000 | -0.053 | 0.053 | 0.031 | torch.Size([120, 360]) || stage2.pa_fuse.fc2.weight
+ | 0.001 | -0.045 | 0.051 | 0.029 | torch.Size([120]) || stage2.pa_fuse.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([480]) || stage3.reshape.1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([480]) || stage3.reshape.1.bias
+ | -0.000 | -0.046 | 0.046 | 0.026 | torch.Size([120, 480]) || stage3.reshape.2.weight
+ | 0.001 | -0.045 | 0.045 | 0.028 | torch.Size([120]) || stage3.reshape.2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.bias
+ | 0.000 | -0.075 | 0.073 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.0.attn.position_bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.003 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.attn.proj.weight
+ | 0.003 | -0.061 | 0.063 | 0.038 | torch.Size([120]) || stage3.residual_group1.blocks.0.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_mut.weight
+ | 0.001 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc11.weight
+ | -0.003 | -0.091 | 0.089 | 0.053 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc12.weight
+ | -0.002 | -0.091 | 0.090 | 0.055 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.063 | 0.064 | 0.039 | torch.Size([120]) || stage3.residual_group1.blocks.0.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.bias
+ | -0.000 | -0.076 | 0.078 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.1.attn.position_bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.004 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.attn.proj.weight
+ | 0.002 | -0.061 | 0.060 | 0.036 | torch.Size([120]) || stage3.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_mut.weight
+ | 0.001 | -0.091 | 0.090 | 0.054 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc11.weight
+ | 0.001 | -0.090 | 0.091 | 0.052 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc12.weight
+ | 0.005 | -0.090 | 0.091 | 0.054 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.006 | -0.064 | 0.063 | 0.038 | torch.Size([120]) || stage3.residual_group1.blocks.1.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.bias
+ | -0.000 | -0.072 | 0.067 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.2.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_self.weight
+ | 0.003 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.attn.proj.weight
+ | 0.003 | -0.064 | 0.064 | 0.040 | torch.Size([120]) || stage3.residual_group1.blocks.2.attn.proj.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_mut.weight
+ | 0.002 | -0.090 | 0.091 | 0.051 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc11.weight
+ | 0.004 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc12.weight
+ | 0.001 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.mlp.fc2.weight
+ | -0.006 | -0.063 | 0.063 | 0.037 | torch.Size([120]) || stage3.residual_group1.blocks.2.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.bias
+ | 0.000 | -0.071 | 0.069 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_self.weight
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.attn.proj.weight
+ | 0.006 | -0.064 | 0.064 | 0.035 | torch.Size([120]) || stage3.residual_group1.blocks.3.attn.proj.bias
+ | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_mut.weight
+ | -0.003 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.006 | -0.090 | 0.090 | 0.052 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.001 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.004 | -0.064 | 0.061 | 0.036 | torch.Size([120]) || stage3.residual_group1.blocks.3.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.bias
+ | -0.000 | -0.073 | 0.069 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.002 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.attn.proj.weight
+ | -0.001 | -0.064 | 0.063 | 0.037 | torch.Size([120]) || stage3.residual_group1.blocks.4.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_mut.weight
+ | -0.000 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc11.weight
+ | 0.006 | -0.091 | 0.090 | 0.055 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc11.bias
+ | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.mlp.fc2.weight
+ | 0.001 | -0.064 | 0.064 | 0.036 | torch.Size([120]) || stage3.residual_group1.blocks.4.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.bias
+ | 0.000 | -0.072 | 0.077 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_self.weight
+ | 0.001 | -0.089 | 0.090 | 0.049 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.attn.proj.weight
+ | -0.006 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage3.residual_group1.blocks.5.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_mut.weight
+ | -0.005 | -0.090 | 0.091 | 0.054 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc11.weight
+ | 0.000 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.000 | -0.090 | 0.091 | 0.052 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.mlp.fc2.weight
+ | -0.002 | -0.064 | 0.063 | 0.036 | torch.Size([120]) || stage3.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage3.linear1.weight
+ | -0.002 | -0.091 | 0.091 | 0.052 | torch.Size([120]) || stage3.linear1.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.bias
+ | -0.000 | -0.095 | 0.080 | 0.020 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.002 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage3.residual_group2.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([120, 120]) || stage3.residual_group2.blocks.0.attn.proj.weight
+ | -0.001 | -0.090 | 0.091 | 0.049 | torch.Size([120]) || stage3.residual_group2.blocks.0.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc11.weight
+ | 0.001 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group2.blocks.0.mlp.fc2.weight
+ | -0.003 | -0.064 | 0.063 | 0.039 | torch.Size([120]) || stage3.residual_group2.blocks.0.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.bias
+ | -0.000 | -0.081 | 0.070 | 0.020 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.1.attn.relative_position_index
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group2.blocks.1.attn.qkv_self.weight
+ | -0.002 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage3.residual_group2.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([120, 120]) || stage3.residual_group2.blocks.1.attn.proj.weight
+ | -0.000 | -0.091 | 0.091 | 0.054 | torch.Size([120]) || stage3.residual_group2.blocks.1.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc11.weight
+ | 0.004 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc12.weight
+ | -0.005 | -0.090 | 0.091 | 0.054 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.001 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.005 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage3.residual_group2.blocks.1.mlp.fc2.bias
+ | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage3.linear2.weight
+ | 0.001 | -0.089 | 0.091 | 0.051 | torch.Size([120]) || stage3.linear2.bias
+ | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.pa_deform.bias
+ | 0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage3.pa_deform.conv_offset.0.weight
+ | -0.002 | -0.021 | 0.021 | 0.013 | torch.Size([120]) || stage3.pa_deform.conv_offset.0.bias
+ | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.2.weight
+ | 0.002 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage3.pa_deform.conv_offset.2.bias
+ | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.4.weight
+ | 0.000 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage3.pa_deform.conv_offset.4.bias
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage3.pa_deform.conv_offset.6.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage3.pa_deform.conv_offset.6.bias
+ | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage3.pa_fuse.fc11.weight
+ | -0.001 | -0.052 | 0.052 | 0.030 | torch.Size([360]) || stage3.pa_fuse.fc11.bias
+ | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage3.pa_fuse.fc12.weight
+ | 0.001 | -0.052 | 0.053 | 0.030 | torch.Size([360]) || stage3.pa_fuse.fc12.bias
+ | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage3.pa_fuse.fc2.weight
+ | 0.007 | -0.051 | 0.052 | 0.030 | torch.Size([120]) || stage3.pa_fuse.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([480]) || stage4.reshape.1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([480]) || stage4.reshape.1.bias
+ | -0.000 | -0.046 | 0.046 | 0.026 | torch.Size([120, 480]) || stage4.reshape.2.weight
+ | 0.003 | -0.045 | 0.045 | 0.028 | torch.Size([120]) || stage4.reshape.2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.bias
+ | -0.000 | -0.068 | 0.084 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.0.attn.position_bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_self.weight
+ | 0.006 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.attn.proj.weight
+ | 0.003 | -0.064 | 0.064 | 0.037 | torch.Size([120]) || stage4.residual_group1.blocks.0.attn.proj.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_mut.weight
+ | -0.001 | -0.090 | 0.091 | 0.051 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.bias
+ | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc11.weight
+ | 0.004 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc12.weight
+ | 0.001 | -0.090 | 0.089 | 0.052 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.mlp.fc2.weight
+ | -0.002 | -0.064 | 0.063 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.0.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.bias
+ | 0.000 | -0.076 | 0.082 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.1.attn.position_bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_self.weight
+ | -0.000 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.attn.proj.weight
+ | -0.001 | -0.064 | 0.063 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.1.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_mut.weight
+ | -0.002 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.005 | -0.091 | 0.090 | 0.052 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc12.weight
+ | 0.006 | -0.090 | 0.090 | 0.053 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.001 | -0.062 | 0.064 | 0.036 | torch.Size([120]) || stage4.residual_group1.blocks.1.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.bias
+ | -0.000 | -0.071 | 0.082 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.2.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_self.weight
+ | 0.002 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.attn.proj.weight
+ | 0.004 | -0.063 | 0.064 | 0.041 | torch.Size([120]) || stage4.residual_group1.blocks.2.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_mut.weight
+ | 0.003 | -0.091 | 0.089 | 0.053 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc11.weight
+ | 0.006 | -0.091 | 0.090 | 0.050 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc12.weight
+ | -0.000 | -0.088 | 0.091 | 0.052 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.mlp.fc2.weight
+ | -0.002 | -0.064 | 0.063 | 0.040 | torch.Size([120]) || stage4.residual_group1.blocks.2.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.bias
+ | 0.000 | -0.083 | 0.065 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.3.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_self.weight
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.attn.proj.weight
+ | 0.000 | -0.063 | 0.064 | 0.039 | torch.Size([120]) || stage4.residual_group1.blocks.3.attn.proj.bias
+ | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_mut.weight
+ | 0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.001 | -0.091 | 0.090 | 0.053 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.005 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.mlp.fc2.weight
+ | -0.002 | -0.064 | 0.062 | 0.034 | torch.Size([120]) || stage4.residual_group1.blocks.3.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.bias
+ | -0.000 | -0.078 | 0.072 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_self.weight
+ | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.attn.proj.weight
+ | -0.001 | -0.063 | 0.064 | 0.037 | torch.Size([120]) || stage4.residual_group1.blocks.4.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_mut.weight
+ | 0.004 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc11.weight
+ | 0.005 | -0.091 | 0.090 | 0.055 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc12.weight
+ | -0.004 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc12.bias
+ | -0.001 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.005 | -0.064 | 0.063 | 0.037 | torch.Size([120]) || stage4.residual_group1.blocks.4.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.bias
+ | 0.000 | -0.079 | 0.076 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.5.attn.position_bias
+ | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_self.weight
+ | 0.001 | -0.091 | 0.091 | 0.050 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.attn.proj.weight
+ | -0.002 | -0.063 | 0.064 | 0.037 | torch.Size([120]) || stage4.residual_group1.blocks.5.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_mut.weight
+ | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc11.weight
+ | 0.005 | -0.090 | 0.089 | 0.053 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.002 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.mlp.fc2.weight
+ | -0.003 | -0.063 | 0.063 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage4.linear1.weight
+ | 0.004 | -0.089 | 0.090 | 0.054 | torch.Size([120]) || stage4.linear1.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.bias
+ | -0.000 | -0.081 | 0.077 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group2.blocks.0.attn.qkv_self.weight
+ | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage4.residual_group2.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage4.residual_group2.blocks.0.attn.proj.weight
+ | -0.005 | -0.090 | 0.091 | 0.051 | torch.Size([120]) || stage4.residual_group2.blocks.0.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.003 | -0.088 | 0.091 | 0.052 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group2.blocks.0.mlp.fc2.weight
+ | -0.004 | -0.064 | 0.065 | 0.039 | torch.Size([120]) || stage4.residual_group2.blocks.0.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.bias
+ | 0.000 | -0.074 | 0.079 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.1.attn.relative_position_index
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group2.blocks.1.attn.qkv_self.weight
+ | -0.004 | -0.091 | 0.090 | 0.050 | torch.Size([360]) || stage4.residual_group2.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage4.residual_group2.blocks.1.attn.proj.weight
+ | 0.005 | -0.090 | 0.088 | 0.053 | torch.Size([120]) || stage4.residual_group2.blocks.1.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc11.weight
+ | 0.001 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc12.weight
+ | 0.003 | -0.091 | 0.090 | 0.053 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.005 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage4.residual_group2.blocks.1.mlp.fc2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage4.linear2.weight
+ | -0.001 | -0.091 | 0.087 | 0.054 | torch.Size([120]) || stage4.linear2.bias
+ | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.pa_deform.bias
+ | -0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage4.pa_deform.conv_offset.0.weight
+ | 0.001 | -0.021 | 0.021 | 0.013 | torch.Size([120]) || stage4.pa_deform.conv_offset.0.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.2.weight
+ | 0.001 | -0.030 | 0.029 | 0.017 | torch.Size([120]) || stage4.pa_deform.conv_offset.2.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.4.weight
+ | 0.001 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage4.pa_deform.conv_offset.4.bias
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage4.pa_deform.conv_offset.6.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage4.pa_deform.conv_offset.6.bias
+ | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage4.pa_fuse.fc11.weight
+ | -0.001 | -0.053 | 0.052 | 0.031 | torch.Size([360]) || stage4.pa_fuse.fc11.bias
+ | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage4.pa_fuse.fc12.weight
+ | 0.001 | -0.053 | 0.052 | 0.031 | torch.Size([360]) || stage4.pa_fuse.fc12.bias
+ | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage4.pa_fuse.fc2.weight
+ | 0.003 | -0.053 | 0.052 | 0.029 | torch.Size([120]) || stage4.pa_fuse.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([30]) || stage5.reshape.1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([30]) || stage5.reshape.1.bias
+ | 0.001 | -0.182 | 0.182 | 0.106 | torch.Size([120, 30]) || stage5.reshape.2.weight
+ | 0.009 | -0.178 | 0.182 | 0.107 | torch.Size([120]) || stage5.reshape.2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.bias
+ | 0.000 | -0.067 | 0.075 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.0.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_self.weight
+ | 0.001 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.attn.proj.weight
+ | 0.002 | -0.063 | 0.064 | 0.039 | torch.Size([120]) || stage5.residual_group1.blocks.0.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_mut.weight
+ | -0.005 | -0.090 | 0.091 | 0.052 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc11.weight
+ | 0.004 | -0.090 | 0.090 | 0.052 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc12.weight
+ | -0.004 | -0.091 | 0.090 | 0.055 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.005 | -0.064 | 0.062 | 0.038 | torch.Size([120]) || stage5.residual_group1.blocks.0.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.bias
+ | 0.000 | -0.073 | 0.071 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.1.attn.position_bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_self.weight
+ | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.attn.proj.weight
+ | -0.002 | -0.064 | 0.061 | 0.035 | torch.Size([120]) || stage5.residual_group1.blocks.1.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_mut.weight
+ | 0.002 | -0.091 | 0.090 | 0.050 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc11.weight
+ | 0.002 | -0.091 | 0.090 | 0.054 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc12.weight
+ | 0.006 | -0.091 | 0.090 | 0.054 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.007 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage5.residual_group1.blocks.1.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.bias
+ | -0.000 | -0.074 | 0.089 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.2.attn.position_bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_self.weight
+ | 0.003 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.attn.proj.weight
+ | 0.001 | -0.062 | 0.064 | 0.038 | torch.Size([120]) || stage5.residual_group1.blocks.2.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_mut.weight
+ | 0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.bias
+ | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.002 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc12.weight
+ | 0.000 | -0.090 | 0.089 | 0.052 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.mlp.fc2.weight
+ | 0.002 | -0.063 | 0.064 | 0.037 | torch.Size([120]) || stage5.residual_group1.blocks.2.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.bias
+ | -0.000 | -0.065 | 0.082 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.3.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_self.weight
+ | -0.003 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.attn.proj.weight
+ | 0.004 | -0.062 | 0.062 | 0.035 | torch.Size([120]) || stage5.residual_group1.blocks.3.attn.proj.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_mut.weight
+ | 0.000 | -0.091 | 0.087 | 0.052 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc11.weight
+ | 0.001 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.mlp.fc2.weight
+ | -0.002 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage5.residual_group1.blocks.3.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.bias
+ | 0.000 | -0.072 | 0.079 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_self.weight
+ | 0.003 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.attn.proj.weight
+ | -0.003 | -0.063 | 0.062 | 0.035 | torch.Size([120]) || stage5.residual_group1.blocks.4.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_mut.weight
+ | 0.002 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc11.weight
+ | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.005 | -0.091 | 0.091 | 0.055 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.001 | -0.063 | 0.064 | 0.036 | torch.Size([120]) || stage5.residual_group1.blocks.4.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.bias
+ | 0.000 | -0.068 | 0.070 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_self.weight
+ | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.attn.proj.weight
+ | -0.007 | -0.064 | 0.064 | 0.037 | torch.Size([120]) || stage5.residual_group1.blocks.5.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_mut.weight
+ | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc11.weight
+ | 0.002 | -0.091 | 0.090 | 0.051 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.004 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.mlp.fc2.weight
+ | -0.001 | -0.064 | 0.064 | 0.040 | torch.Size([120]) || stage5.residual_group1.blocks.5.mlp.fc2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage5.linear1.weight
+ | -0.002 | -0.090 | 0.091 | 0.057 | torch.Size([120]) || stage5.linear1.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.bias
+ | -0.000 | -0.078 | 0.101 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.005 | -0.090 | 0.091 | 0.053 | torch.Size([360]) || stage5.residual_group2.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage5.residual_group2.blocks.0.attn.proj.weight
+ | 0.006 | -0.090 | 0.091 | 0.054 | torch.Size([120]) || stage5.residual_group2.blocks.0.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.004 | -0.091 | 0.090 | 0.054 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.003 | -0.091 | 0.090 | 0.050 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group2.blocks.0.mlp.fc2.weight
+ | -0.001 | -0.064 | 0.063 | 0.039 | torch.Size([120]) || stage5.residual_group2.blocks.0.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.bias
+ | -0.000 | -0.087 | 0.084 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.1.attn.relative_position_index
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group2.blocks.1.attn.qkv_self.weight
+ | 0.002 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage5.residual_group2.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage5.residual_group2.blocks.1.attn.proj.weight
+ | 0.000 | -0.089 | 0.091 | 0.053 | torch.Size([120]) || stage5.residual_group2.blocks.1.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.bias
+ | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.002 | -0.091 | 0.091 | 0.050 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc12.weight
+ | -0.003 | -0.090 | 0.091 | 0.052 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.001 | -0.062 | 0.064 | 0.039 | torch.Size([120]) || stage5.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage5.linear2.weight
+ | -0.013 | -0.088 | 0.083 | 0.050 | torch.Size([120]) || stage5.linear2.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.pa_deform.bias
+ | 0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage5.pa_deform.conv_offset.0.weight
+ | 0.001 | -0.021 | 0.021 | 0.013 | torch.Size([120]) || stage5.pa_deform.conv_offset.0.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.2.weight
+ | -0.001 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage5.pa_deform.conv_offset.2.bias
+ | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.4.weight
+ | 0.000 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage5.pa_deform.conv_offset.4.bias
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage5.pa_deform.conv_offset.6.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage5.pa_deform.conv_offset.6.bias
+ | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage5.pa_fuse.fc11.weight
+ | 0.000 | -0.053 | 0.053 | 0.031 | torch.Size([360]) || stage5.pa_fuse.fc11.bias
+ | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage5.pa_fuse.fc12.weight
+ | 0.001 | -0.053 | 0.053 | 0.030 | torch.Size([360]) || stage5.pa_fuse.fc12.bias
+ | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage5.pa_fuse.fc2.weight
+ | -0.006 | -0.050 | 0.051 | 0.028 | torch.Size([120]) || stage5.pa_fuse.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([30]) || stage6.reshape.1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([30]) || stage6.reshape.1.bias
+ | -0.002 | -0.182 | 0.183 | 0.106 | torch.Size([120, 30]) || stage6.reshape.2.weight
+ | -0.008 | -0.181 | 0.180 | 0.110 | torch.Size([120]) || stage6.reshape.2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.bias
+ | -0.000 | -0.069 | 0.069 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.0.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_self.weight
+ | 0.002 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.attn.proj.weight
+ | -0.005 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage6.residual_group1.blocks.0.attn.proj.bias
+ | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_mut.weight
+ | 0.002 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc11.weight
+ | -0.007 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc11.bias
+ | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc12.weight
+ | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.mlp.fc2.weight
+ | -0.001 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage6.residual_group1.blocks.0.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.bias
+ | -0.000 | -0.068 | 0.074 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.1.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.004 | -0.090 | 0.091 | 0.052 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_self.bias
+
| 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.attn.proj.weight + | 0.000 | -0.065 | 0.062 | 0.036 | torch.Size([120]) || stage6.residual_group1.blocks.1.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.001 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc11.weight + | 0.001 | -0.091 | 0.090 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc12.weight + | -0.002 | -0.090 | 0.090 | 0.051 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.mlp.fc2.weight + | 0.002 | -0.064 | 0.063 | 0.039 | torch.Size([120]) || stage6.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.bias + | 0.000 | -0.080 | 0.079 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.2.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_self.weight + | 0.003 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_self.bias + | -0.001 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.attn.proj.weight + | 0.010 | -0.065 | 0.064 | 0.036 | torch.Size([120]) || stage6.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.001 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc11.weight + | 0.004 | -0.090 | 0.091 | 0.052 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc12.weight + | 0.000 | -0.091 | 0.090 | 0.052 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.mlp.fc2.weight + | 0.004 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage6.residual_group1.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || 
stage6.residual_group1.blocks.3.norm1.bias + | 0.000 | -0.069 | 0.074 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_self.weight + | -0.005 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.attn.proj.weight + | -0.002 | -0.064 | 0.064 | 0.036 | torch.Size([120]) || stage6.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc11.weight + | -0.001 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc12.weight + | -0.004 | -0.088 | 0.087 | 0.047 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.mlp.fc2.weight + | -0.000 | -0.062 | 0.064 | 0.037 | torch.Size([120]) || stage6.residual_group1.blocks.3.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.bias + | 0.000 | -0.065 | 0.074 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_self.weight + | -0.003 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.attn.proj.weight + | 0.007 | -0.064 | 0.063 | 0.037 | torch.Size([120]) || stage6.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.001 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc11.weight + | -0.006 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || 
stage6.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc12.weight + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.mlp.fc2.weight + | 0.000 | -0.062 | 0.064 | 0.037 | torch.Size([120]) || stage6.residual_group1.blocks.4.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.bias + | -0.000 | -0.069 | 0.075 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_self.weight + | 0.004 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.attn.proj.weight + | -0.001 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage6.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.003 | -0.090 | 0.090 | 0.055 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc11.weight + | 0.002 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc12.weight + | -0.003 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.mlp.fc2.weight + | -0.001 | -0.064 | 0.065 | 0.038 | torch.Size([120]) || stage6.residual_group1.blocks.5.mlp.fc2.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage6.linear1.weight + | -0.005 | -0.089 | 0.091 | 0.055 | torch.Size([120]) || stage6.linear1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.bias + | 0.000 | -0.077 | 0.081 | 0.020 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group2.blocks.0.attn.qkv_self.weight + | 0.005 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage6.residual_group2.blocks.0.attn.qkv_self.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage6.residual_group2.blocks.0.attn.proj.weight + | 0.003 | -0.090 | 
0.090 | 0.046 | torch.Size([120]) || stage6.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc11.weight + | -0.000 | -0.090 | 0.089 | 0.054 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc12.weight + | 0.003 | -0.091 | 0.089 | 0.052 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group2.blocks.0.mlp.fc2.weight + | -0.000 | -0.064 | 0.064 | 0.035 | torch.Size([120]) || stage6.residual_group2.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.079 | 0.080 | 0.020 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group2.blocks.1.attn.qkv_self.weight + | -0.004 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage6.residual_group2.blocks.1.attn.qkv_self.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage6.residual_group2.blocks.1.attn.proj.weight + | 0.000 | -0.091 | 0.091 | 0.055 | torch.Size([120]) || stage6.residual_group2.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc11.weight + | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc11.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc12.weight + | 0.000 | -0.090 | 0.090 | 0.057 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group2.blocks.1.mlp.fc2.weight + | -0.000 | -0.064 | 0.064 | 0.035 | torch.Size([120]) || stage6.residual_group2.blocks.1.mlp.fc2.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage6.linear2.weight + | 0.002 | -0.091 | 0.091 | 0.055 | torch.Size([120]) || stage6.linear2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.pa_deform.bias + | 0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage6.pa_deform.conv_offset.0.weight + | -0.001 | -0.021 | 0.021 | 0.013 | torch.Size([120]) || stage6.pa_deform.conv_offset.0.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.2.weight + | -0.001 | -0.030 | 0.030 | 0.019 | torch.Size([120]) || stage6.pa_deform.conv_offset.2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.4.weight + | -0.001 | -0.029 
| 0.029 | 0.017 | torch.Size([120]) || stage6.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage6.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage6.pa_deform.conv_offset.6.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage6.pa_fuse.fc11.weight + | -0.001 | -0.053 | 0.053 | 0.030 | torch.Size([360]) || stage6.pa_fuse.fc11.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage6.pa_fuse.fc12.weight + | 0.000 | -0.052 | 0.053 | 0.031 | torch.Size([360]) || stage6.pa_fuse.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage6.pa_fuse.fc2.weight + | 0.000 | -0.051 | 0.052 | 0.031 | torch.Size([120]) || stage6.pa_fuse.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([30]) || stage7.reshape.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([30]) || stage7.reshape.1.bias + | 0.001 | -0.183 | 0.182 | 0.106 | torch.Size([120, 30]) || stage7.reshape.2.weight + | -0.004 | -0.178 | 0.182 | 0.104 | torch.Size([120]) || stage7.reshape.2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.061 | 0.074 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_self.weight + | 0.003 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.attn.proj.weight + | -0.002 | -0.064 | 0.064 | 0.034 | torch.Size([120]) || stage7.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc11.weight + | -0.001 | -0.090 | 0.091 | 0.052 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc12.weight + | -0.002 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.mlp.fc2.weight + | -0.002 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage7.residual_group1.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.bias + | -0.000 | -0.069 | 0.071 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 
0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_self.weight + | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.attn.proj.weight + | -0.007 | -0.064 | 0.063 | 0.035 | torch.Size([120]) || stage7.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.001 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc11.weight + | -0.003 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc12.weight + | -0.002 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.mlp.fc2.weight + | -0.006 | -0.064 | 0.059 | 0.038 | torch.Size([120]) || stage7.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.bias + | -0.000 | -0.083 | 0.070 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.2.attn.position_bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.attn.proj.weight + | -0.001 | -0.061 | 0.064 | 0.037 | torch.Size([120]) || stage7.residual_group1.blocks.2.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.006 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc11.weight + | -0.001 | -0.090 | 0.091 | 0.055 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc12.weight + | -0.000 | -0.090 | 0.090 | 
0.052 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.mlp.fc2.weight + | -0.000 | -0.064 | 0.063 | 0.037 | torch.Size([120]) || stage7.residual_group1.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.bias + | -0.000 | -0.066 | 0.069 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_self.weight + | -0.001 | -0.091 | 0.090 | 0.053 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.attn.proj.weight + | -0.000 | -0.064 | 0.064 | 0.037 | torch.Size([120]) || stage7.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.004 | -0.091 | 0.090 | 0.051 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc11.weight + | -0.002 | -0.090 | 0.091 | 0.053 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc12.weight + | -0.003 | -0.091 | 0.090 | 0.054 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.mlp.fc2.weight + | -0.001 | -0.064 | 0.062 | 0.039 | torch.Size([120]) || stage7.residual_group1.blocks.3.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.081 | 0.067 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.4.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_self.weight + | -0.002 | -0.091 | 0.089 | 0.052 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.attn.proj.weight + | -0.001 | -0.063 | 0.063 | 0.036 | torch.Size([120]) || stage7.residual_group1.blocks.4.attn.proj.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.001 | -0.090 | 0.089 | 0.054 | 
torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc11.weight + | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc12.weight + | 0.005 | -0.090 | 0.091 | 0.051 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.mlp.fc2.weight + | -0.000 | -0.063 | 0.063 | 0.037 | torch.Size([120]) || stage7.residual_group1.blocks.4.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.bias + | 0.000 | -0.070 | 0.076 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_self.weight + | 0.004 | -0.091 | 0.090 | 0.053 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.attn.proj.weight + | 0.001 | -0.063 | 0.063 | 0.036 | torch.Size([120]) || stage7.residual_group1.blocks.5.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.008 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc11.weight + | 0.003 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc12.weight + | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.mlp.fc2.weight + | -0.004 | -0.062 | 0.064 | 0.036 | torch.Size([120]) || stage7.residual_group1.blocks.5.mlp.fc2.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage7.linear1.weight + | -0.007 | -0.091 | 0.090 | 0.051 | torch.Size([120]) || stage7.linear1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.bias + | -0.000 | -0.078 | 0.090 | 0.020 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | 
torch.Size([384, 384]) || stage7.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group2.blocks.0.attn.qkv_self.weight + | 0.000 | -0.091 | 0.090 | 0.054 | torch.Size([360]) || stage7.residual_group2.blocks.0.attn.qkv_self.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage7.residual_group2.blocks.0.attn.proj.weight + | 0.002 | -0.090 | 0.087 | 0.055 | torch.Size([120]) || stage7.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc11.weight + | 0.001 | -0.091 | 0.088 | 0.051 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc12.weight + | 0.001 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group2.blocks.0.mlp.fc2.weight + | 0.003 | -0.063 | 0.064 | 0.038 | torch.Size([120]) || stage7.residual_group2.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.bias + | 0.000 | -0.079 | 0.079 | 0.020 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage7.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group2.blocks.1.attn.qkv_self.weight + | -0.004 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage7.residual_group2.blocks.1.attn.qkv_self.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage7.residual_group2.blocks.1.attn.proj.weight + | 0.007 | -0.090 | 0.090 | 0.052 | torch.Size([120]) || stage7.residual_group2.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc11.weight + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc12.weight + | 0.001 | -0.091 | 0.090 | 0.052 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group2.blocks.1.mlp.fc2.weight + | 0.005 | -0.060 | 0.064 | 0.036 | torch.Size([120]) || stage7.residual_group2.blocks.1.mlp.fc2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage7.linear2.weight + | -0.009 | -0.087 | 0.087 | 0.048 | torch.Size([120]) || stage7.linear2.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.pa_deform.bias + | -0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || 
stage7.pa_deform.conv_offset.0.weight + | 0.002 | -0.020 | 0.021 | 0.012 | torch.Size([120]) || stage7.pa_deform.conv_offset.0.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.2.weight + | 0.000 | -0.030 | 0.030 | 0.016 | torch.Size([120]) || stage7.pa_deform.conv_offset.2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.4.weight + | 0.000 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage7.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage7.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage7.pa_deform.conv_offset.6.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage7.pa_fuse.fc11.weight + | 0.000 | -0.052 | 0.052 | 0.029 | torch.Size([360]) || stage7.pa_fuse.fc11.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage7.pa_fuse.fc12.weight + | 0.002 | -0.053 | 0.053 | 0.031 | torch.Size([360]) || stage7.pa_fuse.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage7.pa_fuse.fc2.weight + | 0.001 | -0.052 | 0.052 | 0.031 | torch.Size([120]) || stage7.pa_fuse.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage8.0.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage8.0.1.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([180, 120]) || stage8.0.2.weight + | -0.001 | -0.090 | 0.090 | 0.053 | torch.Size([180]) || stage8.0.2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.bias + | 0.000 | -0.075 | 0.081 | 0.020 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.0.attn.qkv_self.weight + | -0.000 | -0.075 | 0.074 | 0.043 | torch.Size([540]) || stage8.1.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.0.attn.proj.weight + | 0.001 | -0.074 | 0.074 | 0.042 | torch.Size([180]) || stage8.1.residual_group.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc11.weight + | 0.001 | -0.075 | 0.074 | 0.042 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc12.weight + | 0.002 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.0.mlp.fc2.weight + | -0.000 | -0.052 | 0.053 | 0.032 | torch.Size([180]) || stage8.1.residual_group.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.bias + | 0.000 | -0.073 | 0.074 
| 0.020 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.1.attn.qkv_self.weight + | -0.002 | -0.074 | 0.074 | 0.042 | torch.Size([540]) || stage8.1.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.1.attn.proj.weight + | 0.003 | -0.073 | 0.074 | 0.042 | torch.Size([180]) || stage8.1.residual_group.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc11.weight + | -0.000 | -0.075 | 0.074 | 0.044 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc12.weight + | -0.002 | -0.074 | 0.073 | 0.043 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.031 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.1.mlp.fc2.weight + | 0.001 | -0.052 | 0.052 | 0.029 | torch.Size([180]) || stage8.1.residual_group.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.bias + | 0.000 | -0.072 | 0.078 | 0.020 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.2.attn.qkv_self.weight + | 0.002 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.1.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.2.attn.proj.weight + | -0.002 | -0.074 | 0.074 | 0.043 | torch.Size([180]) || stage8.1.residual_group.blocks.2.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc11.weight + | 0.000 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc12.weight + | -0.001 | -0.074 | 0.073 | 0.044 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.2.mlp.fc2.weight + | 0.002 | -0.049 | 0.053 | 0.030 | torch.Size([180]) || stage8.1.residual_group.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.bias + | -0.000 | -0.071 | 0.085 | 0.020 | 
torch.Size([2475, 6]) || stage8.1.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.3.attn.qkv_self.weight + | -0.002 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.1.residual_group.blocks.3.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.3.attn.proj.weight + | 0.002 | -0.074 | 0.074 | 0.042 | torch.Size([180]) || stage8.1.residual_group.blocks.3.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc11.weight + | 0.002 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc12.weight + | 0.000 | -0.073 | 0.074 | 0.042 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.3.mlp.fc2.weight + | -0.005 | -0.053 | 0.052 | 0.030 | torch.Size([180]) || stage8.1.residual_group.blocks.3.mlp.fc2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.1.linear.weight + | -0.002 | -0.074 | 0.074 | 0.043 | torch.Size([180]) || stage8.1.linear.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.bias + | 0.000 | -0.075 | 0.080 | 0.020 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.0.attn.qkv_self.weight + | -0.002 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.2.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.0.attn.proj.weight + | 0.001 | -0.072 | 0.074 | 0.042 | torch.Size([180]) || stage8.2.residual_group.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc11.weight + | -0.002 | -0.074 | 0.073 | 0.043 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc12.weight + | -0.000 | -0.074 | 0.074 | 0.041 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.0.mlp.fc2.weight + | -0.002 | -0.052 | 0.052 | 0.030 | torch.Size([180]) || stage8.2.residual_group.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || 
stage8.2.residual_group.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.bias + | 0.000 | -0.084 | 0.071 | 0.020 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.1.attn.qkv_self.weight + | 0.001 | -0.074 | 0.074 | 0.040 | torch.Size([540]) || stage8.2.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.1.attn.proj.weight + | -0.002 | -0.074 | 0.070 | 0.042 | torch.Size([180]) || stage8.2.residual_group.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc11.weight + | -0.000 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc12.weight + | -0.001 | -0.075 | 0.073 | 0.041 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.1.mlp.fc2.weight + | -0.001 | -0.053 | 0.052 | 0.030 | torch.Size([180]) || stage8.2.residual_group.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.bias + | -0.000 | -0.086 | 0.076 | 0.020 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.2.attn.qkv_self.weight + | -0.001 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.2.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.2.attn.proj.weight + | 0.002 | -0.073 | 0.074 | 0.041 | torch.Size([180]) || stage8.2.residual_group.blocks.2.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc11.weight + | 0.000 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc12.weight + | -0.002 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.031 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.2.mlp.fc2.weight + | 0.002 | -0.053 | 0.053 | 0.031 | torch.Size([180]) || stage8.2.residual_group.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || 
stage8.2.residual_group.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.bias + | 0.000 | -0.078 | 0.070 | 0.020 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.3.attn.qkv_self.weight + | 0.001 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.2.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.3.attn.proj.weight + | -0.002 | -0.074 | 0.075 | 0.046 | torch.Size([180]) || stage8.2.residual_group.blocks.3.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc11.weight + | 0.002 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc12.weight + | -0.003 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.3.mlp.fc2.weight + | 0.001 | -0.052 | 0.052 | 0.030 | torch.Size([180]) || stage8.2.residual_group.blocks.3.mlp.fc2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.2.linear.weight + | 0.004 | -0.074 | 0.074 | 0.044 | torch.Size([180]) || stage8.2.linear.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.bias + | -0.000 | -0.087 | 0.074 | 0.020 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.074 | 0.075 | 0.043 | torch.Size([540]) || stage8.3.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.0.attn.proj.weight + | 0.004 | -0.072 | 0.074 | 0.041 | torch.Size([180]) || stage8.3.residual_group.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc11.weight + | 0.000 | -0.073 | 0.074 | 0.043 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc12.weight + | 0.000 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.0.mlp.fc2.weight + | -0.000 | 
-0.053 | 0.052 | 0.031 | torch.Size([180]) || stage8.3.residual_group.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.bias + | 0.000 | -0.074 | 0.073 | 0.020 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.1.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.1.attn.qkv_self.weight + | 0.001 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.3.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.1.attn.proj.weight + | 0.002 | -0.074 | 0.074 | 0.043 | torch.Size([180]) || stage8.3.residual_group.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc11.weight + | -0.001 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc12.weight + | 0.002 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.1.mlp.fc2.weight + | -0.001 | -0.053 | 0.051 | 0.030 | torch.Size([180]) || stage8.3.residual_group.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.bias + | -0.000 | -0.085 | 0.087 | 0.020 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.2.attn.qkv_self.weight + | 0.002 | -0.075 | 0.074 | 0.044 | torch.Size([540]) || stage8.3.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.2.attn.proj.weight + | -0.005 | -0.074 | 0.074 | 0.043 | torch.Size([180]) || stage8.3.residual_group.blocks.2.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc11.weight + | 0.004 | -0.074 | 0.075 | 0.045 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc12.weight + | -0.003 | -0.074 | 0.071 | 0.042 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.2.mlp.fc2.weight + | 0.001 | -0.052 | 0.053 
| 0.030 | torch.Size([180]) || stage8.3.residual_group.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.bias + | -0.000 | -0.077 | 0.093 | 0.020 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.3.attn.qkv_self.weight + | 0.002 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.3.residual_group.blocks.3.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.3.attn.proj.weight + | 0.002 | -0.074 | 0.074 | 0.045 | torch.Size([180]) || stage8.3.residual_group.blocks.3.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc11.weight + | -0.001 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc12.weight + | 0.002 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.3.mlp.fc2.weight + | -0.001 | -0.052 | 0.053 | 0.032 | torch.Size([180]) || stage8.3.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.3.linear.weight + | 0.002 | -0.074 | 0.073 | 0.042 | torch.Size([180]) || stage8.3.linear.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.bias + | 0.000 | -0.074 | 0.082 | 0.020 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.4.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.0.attn.proj.weight + | 0.003 | -0.074 | 0.074 | 0.042 | torch.Size([180]) || stage8.4.residual_group.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc11.weight + | 0.002 | -0.074 | 0.075 | 0.045 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc12.weight + | 0.002 | -0.073 | 0.074 | 0.043 | torch.Size([360]) || 
stage8.4.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.0.mlp.fc2.weight + | -0.001 | -0.053 | 0.053 | 0.029 | torch.Size([180]) || stage8.4.residual_group.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.bias + | 0.000 | -0.077 | 0.076 | 0.020 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.1.attn.qkv_self.weight + | -0.003 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.4.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.1.attn.proj.weight + | -0.004 | -0.074 | 0.074 | 0.044 | torch.Size([180]) || stage8.4.residual_group.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc11.weight + | -0.001 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc12.weight + | -0.002 | -0.074 | 0.074 | 0.045 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.1.mlp.fc2.weight + | 0.003 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.4.residual_group.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.bias + | -0.000 | -0.075 | 0.073 | 0.020 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.2.attn.qkv_self.weight + | 0.002 | -0.074 | 0.074 | 0.042 | torch.Size([540]) || stage8.4.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.2.attn.proj.weight + | -0.000 | -0.074 | 0.074 | 0.045 | torch.Size([180]) || stage8.4.residual_group.blocks.2.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc11.weight + | 0.002 | -0.074 | 0.074 | 0.041 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc12.weight + | -0.001 | -0.074 | 0.073 | 0.042 | torch.Size([360]) || 
stage8.4.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.2.mlp.fc2.weight + | 0.001 | -0.053 | 0.053 | 0.030 | torch.Size([180]) || stage8.4.residual_group.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.bias + | 0.000 | -0.082 | 0.087 | 0.020 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.3.attn.qkv_self.weight + | 0.001 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.4.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.3.attn.proj.weight + | 0.003 | -0.074 | 0.073 | 0.044 | torch.Size([180]) || stage8.4.residual_group.blocks.3.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc11.weight + | 0.001 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc12.weight + | 0.003 | -0.073 | 0.074 | 0.041 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.3.mlp.fc2.weight + | -0.002 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.4.residual_group.blocks.3.mlp.fc2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.4.linear.weight + | 0.000 | -0.074 | 0.074 | 0.043 | torch.Size([180]) || stage8.4.linear.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.bias + | -0.000 | -0.060 | 0.059 | 0.019 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.0.attn.qkv_self.weight + | -0.000 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.5.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.0.attn.proj.weight + | -0.003 | -0.074 | 0.072 | 0.044 | torch.Size([180]) || stage8.5.residual_group.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc11.weight + | -0.000 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 
0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc12.weight + | -0.000 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.0.mlp.fc2.weight + | -0.003 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.5.residual_group.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.bias + | 0.001 | -0.059 | 0.062 | 0.020 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.1.attn.qkv_self.weight + | 0.003 | -0.075 | 0.075 | 0.044 | torch.Size([540]) || stage8.5.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.1.attn.proj.weight + | -0.002 | -0.074 | 0.074 | 0.041 | torch.Size([180]) || stage8.5.residual_group.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc11.weight + | 0.002 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc12.weight + | -0.005 | -0.074 | 0.074 | 0.045 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.1.mlp.fc2.weight + | -0.001 | -0.053 | 0.052 | 0.031 | torch.Size([180]) || stage8.5.residual_group.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.bias + | -0.001 | -0.074 | 0.060 | 0.020 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.2.attn.qkv_self.weight + | -0.002 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.5.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.2.attn.proj.weight + | -0.001 | -0.073 | 0.073 | 0.045 | torch.Size([180]) || stage8.5.residual_group.blocks.2.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc11.weight + | -0.004 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 
180]) || stage8.5.residual_group.blocks.2.mlp.fc12.weight + | 0.001 | -0.075 | 0.075 | 0.044 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.2.mlp.fc2.weight + | -0.002 | -0.053 | 0.052 | 0.031 | torch.Size([180]) || stage8.5.residual_group.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.bias + | -0.000 | -0.064 | 0.085 | 0.020 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.3.attn.qkv_self.weight + | 0.001 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.5.residual_group.blocks.3.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.3.attn.proj.weight + | 0.002 | -0.074 | 0.074 | 0.044 | torch.Size([180]) || stage8.5.residual_group.blocks.3.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc11.weight + | 0.000 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc12.weight + | -0.001 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.3.mlp.fc2.weight + | -0.002 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.5.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.5.linear.weight + | 0.001 | -0.074 | 0.074 | 0.043 | torch.Size([180]) || stage8.5.linear.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.bias + | 0.000 | -0.064 | 0.057 | 0.020 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.074 | 0.074 | 0.042 | torch.Size([540]) || stage8.6.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.0.attn.proj.weight + | -0.003 | -0.075 | 0.073 | 0.042 | torch.Size([180]) || stage8.6.residual_group.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc11.weight + | 0.001 | -0.074 | 
0.074 | 0.044 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc12.weight + | -0.001 | -0.074 | 0.072 | 0.044 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.0.mlp.fc2.weight + | 0.001 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.6.residual_group.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.bias + | 0.001 | -0.061 | 0.074 | 0.020 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.1.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.1.attn.qkv_self.weight + | -0.000 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.6.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.1.attn.proj.weight + | 0.001 | -0.073 | 0.070 | 0.042 | torch.Size([180]) || stage8.6.residual_group.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc11.weight + | 0.002 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc12.weight + | 0.001 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.1.mlp.fc2.weight + | 0.001 | -0.052 | 0.053 | 0.032 | torch.Size([180]) || stage8.6.residual_group.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.bias + | -0.000 | -0.059 | 0.058 | 0.020 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.2.attn.qkv_self.weight + | 0.001 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.6.residual_group.blocks.2.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.2.attn.proj.weight + | 0.004 | -0.074 | 0.074 | 0.043 | torch.Size([180]) || stage8.6.residual_group.blocks.2.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc11.weight + | 0.005 | -0.074 | 0.074 | 0.044 | 
torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc12.weight + | 0.001 | -0.074 | 0.075 | 0.044 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.2.mlp.fc2.weight + | 0.001 | -0.051 | 0.051 | 0.030 | torch.Size([180]) || stage8.6.residual_group.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.bias + | 0.000 | -0.070 | 0.061 | 0.020 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.3.attn.qkv_self.weight + | 0.001 | -0.074 | 0.075 | 0.043 | torch.Size([540]) || stage8.6.residual_group.blocks.3.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.3.attn.proj.weight + | -0.000 | -0.072 | 0.074 | 0.044 | torch.Size([180]) || stage8.6.residual_group.blocks.3.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc11.weight + | 0.002 | -0.074 | 0.075 | 0.043 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc12.weight + | -0.002 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.3.mlp.fc2.weight + | 0.001 | -0.052 | 0.053 | 0.031 | torch.Size([180]) || stage8.6.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.6.linear.weight + | 0.002 | -0.073 | 0.074 | 0.042 | torch.Size([180]) || stage8.6.linear.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || norm.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || norm.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([120, 180]) || conv_after_body.weight + | 0.004 | -0.071 | 0.072 | 0.043 | torch.Size([120]) || conv_after_body.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([64, 120, 1, 3, 3]) || conv_before_upsample.0.weight + | -0.003 | -0.029 | 0.029 | 0.018 | torch.Size([64]) || conv_before_upsample.0.bias + | -0.000 | -0.042 | 0.042 | 0.024 | torch.Size([256, 64, 1, 3, 3]) || upsample.0.weight + | -0.001 | -0.042 | 0.041 | 0.023 | torch.Size([256]) || upsample.0.bias + | -0.000 | -0.042 | 0.042 | 0.024 | torch.Size([256, 64, 1, 3, 3]) || upsample.5.weight + | -0.001 | -0.041 | 0.041 | 0.023 | torch.Size([256]) || upsample.5.bias + | 0.000 | -0.042 | 0.042 | 0.024 | torch.Size([64, 64, 1, 3, 3]) || upsample.10.weight + | 0.006 | -0.038 | 0.041 | 0.022 | torch.Size([64]) || upsample.10.bias + | 0.001 | -0.042 | 0.042 | 0.024 | torch.Size([3, 64, 1, 3, 3]) || conv_last.weight + | 0.011 | -0.006 | 0.025 | 0.016 | torch.Size([3]) || conv_last.bias +
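The rows above close the per-parameter summary that the logger emits after constructing netG: one row per state-dict entry, formatted as mean | min | max | std | shape || name. The shapes agree with the netG options: for the (6, 8, 8) attention window, each relative_position_bias_table has (2*6-1)*(2*8-1)*(2*8-1) = 2475 rows (one per 3D relative offset, 6 heads per row), and the paired relative_position_index is 384 x 384 because a window holds 6*8*8 = 384 tokens; the [225, 6] / [64, 64] pairs in stage8.5 and stage8.6 match spatial-only (1, 8, 8) windows, consistent with the two indep_reconsts stages. A table like this can be regenerated from any checkpoint; the following is a minimal sketch (describe_params is our name for it, and KAIR's exact column formatting may differ):

import torch

def describe_params(model: torch.nn.Module) -> str:
    # One row per state-dict entry: mean | min | max | std | shape || name.
    rows = []
    for name, t in model.state_dict().items():
        v = t.float()  # relative_position_index buffers are integer tensors; cast before stats
        std = v.std().item() if v.numel() > 1 else 0.0  # std of a single element is NaN
        rows.append(' | {:.3f} | {:.3f} | {:.3f} | {:.3f} | {} || {}'.format(
            v.mean().item(), v.min().item(), v.max().item(), std, t.shape, name))
    return '\n'.join(rows)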
+22-03-11 10:16:36.045 : task: 001_train_vrt_videosr_bi_reds_6frames + model: vrt + gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7] + dist: False + find_unused_parameters: False + use_static_graph: True + scale: 4 + n_channels: 3 + path:[ + root: experiments + pretrained_netG: None + pretrained_netE: None + task: experiments/001_train_vrt_videosr_bi_reds_6frames + log: experiments/001_train_vrt_videosr_bi_reds_6frames + options: experiments/001_train_vrt_videosr_bi_reds_6frames/options + models: experiments/001_train_vrt_videosr_bi_reds_6frames/models + images: experiments/001_train_vrt_videosr_bi_reds_6frames/images + pretrained_optimizerG: None + ] + datasets:[ + train:[ + name: train_dataset + dataset_type: VideoRecurrentTrainDataset + dataroot_gt: /home/cll/datasets/REDS/val/val_sharp + dataroot_lq: /home/cll/datasets/REDS/val/val_sharp_bicubic + meta_info_file: + filename_tmpl: 08d + filename_ext: png + val_partition: REDS4 + test_mode: False + io_backend:[ + type: disk + ] + num_frame: 6 + gt_size: 256 + interval_list: [1] + random_reverse: False + use_hflip: True + use_rot: True + dataloader_shuffle: True + dataloader_num_workers: 32 + dataloader_batch_size: 8 + phase: train + scale: 4 + n_channels: 3 + ] + test:[ + name: test_dataset + dataset_type: VideoRecurrentTestDataset + dataroot_gt: /home/cll/Desktop/REDS4/GT + dataroot_lq: /home/cll/Desktop/REDS4/sharp_bicubic + cache_data: True + io_backend:[ + type: disk + ] + num_frame: -1 + phase: test + scale: 4 + n_channels: 3 + ] + ] + netG:[ + net_type: vrt + upscale: 4 + img_size: [6, 64, 64] + window_size: [6, 8, 8] + depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4] + indep_reconsts: [11, 12] + embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180] + num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6] + spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth + pa_frames: 2 + deformable_groups: 12 + nonblind_denoising: False + use_checkpoint_attn: False + use_checkpoint_ffn: False + no_checkpoint_attn_blocks: [] + no_checkpoint_ffn_blocks: [] + init_type: default + scale: 4 + ] + train:[ + G_lossfn_type: charbonnier + G_lossfn_weight: 1.0 + G_charbonnier_eps: 1e-09 + E_decay: 0 + G_optimizer_type: adam + G_optimizer_lr: 0.0004 + G_optimizer_betas: [0.9, 0.99] + G_optimizer_wd: 0 + G_optimizer_clipgrad: None + G_optimizer_reuse: True + fix_iter: 20000 + fix_lr_mul: 0.125 + fix_keys: ['spynet', 'deform'] + total_iter: 300000 + G_scheduler_type: CosineAnnealingWarmRestarts + G_scheduler_periods: 300000 + G_scheduler_eta_min: 1e-07 + G_regularizer_orthstep: None + G_regularizer_clipstep: None + G_param_strict: True + E_param_strict: True + checkpoint_test: 5000 + checkpoint_save: 5000 + checkpoint_print: 200 + F_feature_layer: 34 + F_weights: 1.0 + F_lossfn_type: l1 + F_use_input_norm: True + F_use_range_norm: False + G_scheduler_restart_weights: 1 + ] + val:[ + save_img: False + pad_seq: False + flip_seq: False + center_frame_only: False + num_frame_testing: 40 + num_frame_overlapping: 2 + size_patch_testing: 128 + ] + opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json + is_train: True + merge_bn: False + merge_bn_startpoint: -1 + num_gpu: 8 + rank: 0 + world_size: 1 + +22-03-11 10:19:49.922 : task: 001_train_vrt_videosr_bi_reds_6frames + model: vrt + gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7] + dist: False + find_unused_parameters: False + use_static_graph: True + scale: 4 + n_channels: 3 + path:[ + root: experiments + pretrained_netG: /home/cll/dev/KAIR/model_zoo/vrt/ + pretrained_netE: None + task: 
experiments/001_train_vrt_videosr_bi_reds_6frames + log: experiments/001_train_vrt_videosr_bi_reds_6frames + options: experiments/001_train_vrt_videosr_bi_reds_6frames/options + models: experiments/001_train_vrt_videosr_bi_reds_6frames/models + images: experiments/001_train_vrt_videosr_bi_reds_6frames/images + pretrained_optimizerG: None + ] + datasets:[ + train:[ + name: train_dataset + dataset_type: VideoRecurrentTrainDataset + dataroot_gt: /home/cll/datasets/REDS/val/val_sharp + dataroot_lq: /home/cll/datasets/REDS/val/val_sharp_bicubic + meta_info_file: + filename_tmpl: 08d + filename_ext: png + val_partition: REDS4 + test_mode: False + io_backend:[ + type: disk + ] + num_frame: 6 + gt_size: 256 + interval_list: [1] + random_reverse: False + use_hflip: True + use_rot: True + dataloader_shuffle: True + dataloader_num_workers: 32 + dataloader_batch_size: 8 + phase: train + scale: 4 + n_channels: 3 + ] + test:[ + name: test_dataset + dataset_type: VideoRecurrentTestDataset + dataroot_gt: /home/cll/Desktop/REDS4/GT + dataroot_lq: /home/cll/Desktop/REDS4/sharp_bicubic + cache_data: True + io_backend:[ + type: disk + ] + num_frame: -1 + phase: test + scale: 4 + n_channels: 3 + ] + ] + netG:[ + net_type: vrt + upscale: 4 + img_size: [6, 64, 64] + window_size: [6, 8, 8] + depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4] + indep_reconsts: [11, 12] + embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180] + num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6] + spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth + pa_frames: 2 + deformable_groups: 12 + nonblind_denoising: False + use_checkpoint_attn: False + use_checkpoint_ffn: False + no_checkpoint_attn_blocks: [] + no_checkpoint_ffn_blocks: [] + init_type: default + scale: 4 + ] + train:[ + G_lossfn_type: charbonnier + G_lossfn_weight: 1.0 + G_charbonnier_eps: 1e-09 + E_decay: 0 + G_optimizer_type: adam + G_optimizer_lr: 0.0004 + G_optimizer_betas: [0.9, 0.99] + G_optimizer_wd: 0 + G_optimizer_clipgrad: None + G_optimizer_reuse: True + fix_iter: 20000 + fix_lr_mul: 0.125 + fix_keys: ['spynet', 'deform'] + total_iter: 300000 + G_scheduler_type: CosineAnnealingWarmRestarts + G_scheduler_periods: 300000 + G_scheduler_eta_min: 1e-07 + G_regularizer_orthstep: None + G_regularizer_clipstep: None + G_param_strict: True + E_param_strict: True + checkpoint_test: 5000 + checkpoint_save: 5000 + checkpoint_print: 200 + F_feature_layer: 34 + F_weights: 1.0 + F_lossfn_type: l1 + F_use_input_norm: True + F_use_range_norm: False + G_scheduler_restart_weights: 1 + ] + val:[ + save_img: False + pad_seq: False + flip_seq: False + center_frame_only: False + num_frame_testing: 40 + num_frame_overlapping: 2 + size_patch_testing: 128 + ] + opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json + is_train: True + merge_bn: False + merge_bn_startpoint: -1 + num_gpu: 8 + rank: 0 + world_size: 1 +
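Only the path and dataset entries change between these option dumps; the train block is identical in every run: Adam (lr 4e-4, betas [0.9, 0.99], no weight decay), cosine annealing down to eta_min = 1e-7 over total_iter = 300,000, the optical-flow and deformable-alignment parameters selected by fix_keys: ['spynet', 'deform'] trained at fix_lr_mul = 0.125 of the base rate for the first fix_iter = 20,000 iterations, and a Charbonnier loss with G_charbonnier_eps = 1e-09. For reference, a common formulation of that loss is sketched below (an assumption on our part; KAIR's implementation may differ in reduction details):

import torch

def charbonnier_loss(pred: torch.Tensor, target: torch.Tensor, eps: float = 1e-9) -> torch.Tensor:
    # Differentiable L1-like penalty: sqrt(diff^2 + eps), averaged over all elements.
    # eps matches G_charbonnier_eps above and keeps the gradient finite at zero error.
    return torch.sqrt((pred - target) ** 2 + eps).mean()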
+22-03-11 10:21:14.310 : task: 001_train_vrt_videosr_bi_reds_6frames + model: vrt + gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7] + dist: False + find_unused_parameters: False + use_static_graph: True + scale: 4 + n_channels: 3 + path:[ + root: experiments + pretrained_netG: /home/cll/dev/KAIR/model_zoo/vrt/ + pretrained_netE: None + task: experiments/001_train_vrt_videosr_bi_reds_6frames + log: experiments/001_train_vrt_videosr_bi_reds_6frames + options: experiments/001_train_vrt_videosr_bi_reds_6frames/options + models: experiments/001_train_vrt_videosr_bi_reds_6frames/models + images: experiments/001_train_vrt_videosr_bi_reds_6frames/images + pretrained_optimizerG: None + ] + datasets:[ + train:[ + name: train_dataset + dataset_type: VideoRecurrentTrainDataset + dataroot_gt: /home/cll/datasets/REDS/val/val_sharp + dataroot_lq: /home/cll/datasets/REDS/val/val_sharp_bicubic + meta_info_file: data/meta_info/meta_info_REDS_GT.txt + filename_tmpl: 08d + filename_ext: png + val_partition: REDS4 + test_mode: False + io_backend:[ + type: disk + ] + num_frame: 6 + gt_size: 256 + interval_list: [1] + random_reverse: False + use_hflip: True + use_rot: True + dataloader_shuffle: True + dataloader_num_workers: 32 + dataloader_batch_size: 8 + phase: train + scale: 4 + n_channels: 3 + ] + test:[ + name: test_dataset + dataset_type: VideoRecurrentTestDataset + dataroot_gt: /home/cll/Desktop/REDS4/GT + dataroot_lq: /home/cll/Desktop/REDS4/sharp_bicubic + cache_data: True + io_backend:[ + type: disk + ] + num_frame: -1 + phase: test + scale: 4 + n_channels: 3 + ] + ] + netG:[ + net_type: vrt + upscale: 4 + img_size: [6, 64, 64] + window_size: [6, 8, 8] + depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4] + indep_reconsts: [11, 12] + embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180] + num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6] + spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth + pa_frames: 2 + deformable_groups: 12 + nonblind_denoising: False + use_checkpoint_attn: False + use_checkpoint_ffn: False + no_checkpoint_attn_blocks: [] + no_checkpoint_ffn_blocks: [] + init_type: default + scale: 4 + ] + train:[ + G_lossfn_type: charbonnier + G_lossfn_weight: 1.0 + G_charbonnier_eps: 1e-09 + E_decay: 0 + G_optimizer_type: adam + G_optimizer_lr: 0.0004 + G_optimizer_betas: [0.9, 0.99] + G_optimizer_wd: 0 + G_optimizer_clipgrad: None + G_optimizer_reuse: True + fix_iter: 20000 + fix_lr_mul: 0.125 + fix_keys: ['spynet', 'deform'] + total_iter: 300000 + G_scheduler_type: CosineAnnealingWarmRestarts + G_scheduler_periods: 300000 + G_scheduler_eta_min: 1e-07 + G_regularizer_orthstep: None + G_regularizer_clipstep: None + G_param_strict: True + E_param_strict: True + checkpoint_test: 5000 + checkpoint_save: 5000 + checkpoint_print: 200 + F_feature_layer: 34 + F_weights: 1.0 + F_lossfn_type: l1 + F_use_input_norm: True + F_use_range_norm: False + G_scheduler_restart_weights: 1 + ] + val:[ + save_img: False + pad_seq: False + flip_seq: False + center_frame_only: False + num_frame_testing: 40 + num_frame_overlapping: 2 + size_patch_testing: 128 + ] + opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json + is_train: True + merge_bn: False + merge_bn_startpoint: -1 + num_gpu: 8 + rank: 0 + world_size: 1 + +22-03-11 10:21:14.354 : Number of train images: 27,000, iters: 3,375
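In the dump above, pretrained_netG still points at the model_zoo/vrt/ directory; the next dump (10:22:14) names the released checkpoint file 001_VRT_videosr_bi_REDS_6frames.pth instead. Initializing netG from such a file reduces to a state-dict load with strict key matching (strict mirrors G_param_strict: True). A minimal sketch, where load_pretrained is our name and the 'params' nesting is an assumption about the release layout:

import torch

def load_pretrained(net: torch.nn.Module, path: str, strict: bool = True) -> torch.nn.Module:
    # Load a .pth checkpoint into netG; strict=True mirrors G_param_strict in the options.
    state = torch.load(path, map_location='cpu')
    if isinstance(state, dict) and 'params' in state:
        state = state['params']  # assumption: released VRT checkpoints nest weights under 'params'
    net.load_state_dict(state, strict=strict)
    return net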
+22-03-11 10:22:14.208 : task: 001_train_vrt_videosr_bi_reds_6frames + model: vrt + gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7] + dist: False + find_unused_parameters: False + use_static_graph: True + scale: 4 + n_channels: 3 + path:[ + root: experiments + pretrained_netG: /home/cll/dev/KAIR/model_zoo/vrt/001_VRT_videosr_bi_REDS_6frames.pth + pretrained_netE: None + task: experiments/001_train_vrt_videosr_bi_reds_6frames + log: experiments/001_train_vrt_videosr_bi_reds_6frames + options: experiments/001_train_vrt_videosr_bi_reds_6frames/options + models: experiments/001_train_vrt_videosr_bi_reds_6frames/models + images: experiments/001_train_vrt_videosr_bi_reds_6frames/images + pretrained_optimizerG: None + ] + datasets:[ + train:[ + name: train_dataset + dataset_type: VideoRecurrentTrainDataset + dataroot_gt: /home/cll/datasets/REDS/val/val_sharp + dataroot_lq: /home/cll/datasets/REDS/val/val_sharp_bicubic + meta_info_file: data/meta_info/meta_info_REDS_GT.txt + filename_tmpl: 08d + filename_ext: png + val_partition: REDS4 + test_mode: False + io_backend:[ + type: disk + ] + num_frame: 6 + gt_size: 256 + interval_list: [1] + random_reverse: False + use_hflip: True + use_rot: True + dataloader_shuffle: True + dataloader_num_workers: 32 + dataloader_batch_size: 8 + phase: train + scale: 4 + n_channels: 3 + ] + test:[ + name: test_dataset + dataset_type: VideoRecurrentTestDataset + dataroot_gt: /home/cll/Desktop/REDS4/GT + dataroot_lq: /home/cll/Desktop/REDS4/sharp_bicubic + cache_data: True + io_backend:[ + type: disk + ] + num_frame: -1 + phase: test + scale: 4 + n_channels: 3 + ] + ] + netG:[ + net_type: vrt + upscale: 4 + img_size: [6, 64, 64] + window_size: [6, 8, 8] + depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4] + indep_reconsts: [11, 12] + embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180] + num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6] + spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth + pa_frames: 2 + deformable_groups: 12 + nonblind_denoising: False + use_checkpoint_attn: False + use_checkpoint_ffn: False + no_checkpoint_attn_blocks: [] + no_checkpoint_ffn_blocks: [] + init_type: default + scale: 4 + ] + train:[ + G_lossfn_type: charbonnier + G_lossfn_weight: 1.0 + G_charbonnier_eps: 1e-09 + E_decay: 0 + G_optimizer_type: adam + G_optimizer_lr: 0.0004 + G_optimizer_betas: [0.9, 0.99] + G_optimizer_wd: 0 + G_optimizer_clipgrad: None + G_optimizer_reuse: True + fix_iter: 20000 + fix_lr_mul: 0.125 + fix_keys: ['spynet', 'deform'] + total_iter: 300000 + G_scheduler_type: CosineAnnealingWarmRestarts + G_scheduler_periods: 300000 + G_scheduler_eta_min: 1e-07 + G_regularizer_orthstep: None + G_regularizer_clipstep: None + G_param_strict: True + E_param_strict: True + checkpoint_test: 5000 + checkpoint_save: 5000 + checkpoint_print: 200 + F_feature_layer: 34 + F_weights: 1.0 + F_lossfn_type: l1 + F_use_input_norm: True + F_use_range_norm: False + G_scheduler_restart_weights: 1 + ] + val:[ + save_img: False + pad_seq: False + flip_seq: False + center_frame_only: False + num_frame_testing: 40 + num_frame_overlapping: 2 + size_patch_testing: 128 + ] + opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json + is_train: True + merge_bn: False + merge_bn_startpoint: -1 + num_gpu: 8 + rank: 0 + world_size: 1 + +22-03-11 10:22:14.252 : Number of train images: 27,000, iters: 3,375 +22-03-11 10:22:28.605 : +Networks name: VRT +Params number: 30676435 +Net structure: +VRT( + (conv_first): Conv3d(27, 120, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (spynet): SpyNet( + (basic_module): ModuleList( + (0): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (1): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): 
Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (2): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (3): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (4): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (5): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + ) + ) + (stage1): Stage( + (reshape): Sequential( + (0): Rearrange('n c d h w -> n d h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): 
Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): 
Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage2): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): 
Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): 
Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage3): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, 
elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): 
Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage4): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) 
+ ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, 
bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage5): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + 
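The pa_deform and pa_fuse entries that close each stage carry the flow-guided alignment step, and their printed channel counts line up with that reading: conv_offset takes 242 = 120 + 120 + 2 channels, consistent with two 120-channel feature maps (current and flow-warped neighbor) plus a 2-channel optical flow, and its 324 output channels factor as 12 deformable groups x (2*3*3 offsets + 3*3 modulation masks) for a 3x3 deformable convolution; pa_fuse maps 360 = 3 x 120 channels, consistent with fusing the original features with the forward- and backward-aligned ones. These decompositions are inferences from the shapes, not quoted from the KAIR source; a quick arithmetic check in plain Python:

    kernel = 3 * 3                     # 3x3 deformable conv
    groups = 324 // (3 * kernel)       # per group: 2k offset coords + k masks
    assert 120 + 120 + 2 == 242        # conv_offset in_channels
    assert groups * (2 * kernel + kernel) == 324
    assert 3 * 120 == 360              # pa_fuse in_features
    print(groups)                      # 12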
(softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, 
inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage6): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): 
Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, 
out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage7): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): 
Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage8): ModuleList( + (0): Sequential( + (0): Rearrange('n c d h w -> n d 
h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=120, out_features=180, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (1): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (2): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + 
(norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (3): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + 
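These stage8 blocks repeat the same Mlp_GEGLU feed-forward at 180 channels (fc11/fc12: 180 -> 360, fc2: 360 -> 180). The printout lists only the layers, so the sketch below spells out the gated-GELU composition those names imply; the exact forward wiring is an assumption inferred from the standard GEGLU pattern rather than quoted from the KAIR source:

    import torch
    import torch.nn as nn

    class Mlp_GEGLU(nn.Module):
        # Layer names and shapes follow the printout; forward order is assumed.
        def __init__(self, in_features=180, hidden_features=360, drop=0.0):
            super().__init__()
            self.fc11 = nn.Linear(in_features, hidden_features)  # value branch
            self.fc12 = nn.Linear(in_features, hidden_features)  # gate branch
            self.act = nn.GELU()
            self.fc2 = nn.Linear(hidden_features, in_features)
            self.drop = nn.Dropout(drop)

        def forward(self, x):
            # GEGLU: the GELU-activated branch gates the linear branch elementwise.
            return self.drop(self.fc2(self.drop(self.act(self.fc11(x)) * self.fc12(x))))

    print(Mlp_GEGLU()(torch.randn(2, 64, 180)).shape)  # torch.Size([2, 64, 180])

Compared with a conventional MLP of the same hidden width, the second projection adds multiplicative gating at the cost of one extra 180 -> 360 weight matrix.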
(drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (4): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + 
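In these RTMSA blocks qkv_self's 540 outputs are the stacked Q/K/V (3 x 180), i.e. 6 heads of dimension 30, and proj is square (180 -> 180) because there is no mutual-attention branch to concatenate. For scale, the Linear layers of one such block account for about 0.33 M parameters; a back-of-the-envelope check against the shapes in the printout (LayerNorms and the relative position bias tables omitted):

    dim, hidden = 180, 360
    attn = (dim * 3 * dim + 3 * dim) + (dim * dim + dim)      # qkv_self + proj
    mlp = 2 * (dim * hidden + hidden) + (hidden * dim + dim)  # fc11 + fc12 + fc2
    print(attn, mlp, attn + mlp)                              # 130320 195300 325620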
(fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (5): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (6): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): 
TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + ) + (norm): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (conv_after_body): Linear(in_features=180, out_features=120, bias=True) + (conv_before_upsample): Sequential( + (0): Conv3d(120, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): LeakyReLU(negative_slope=0.01, inplace=True) + ) + (upsample): Upsample( + (0): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): Transpose_Dim12() + (2): PixelShuffle(upscale_factor=2) + (3): Transpose_Dim12() + (4): LeakyReLU(negative_slope=0.1, inplace=True) + (5): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (6): Transpose_Dim12() + (7): PixelShuffle(upscale_factor=2) + (8): Transpose_Dim12() + (9): LeakyReLU(negative_slope=0.1, inplace=True) + (10): Conv3d(64, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + ) + (conv_last): Conv3d(64, 3, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) +) + +22-03-11 10:22:28.777 : + | mean | min | max | std || shape + | -0.000 | -1.462 | 1.580 | 0.103 | torch.Size([120, 27, 1, 3, 3]) || conv_first.weight + | 0.005 | -0.950 | 0.885 | 0.268 | torch.Size([120]) || conv_first.bias + | 0.449 | 0.406 | 0.485 | 0.040 | torch.Size([1, 3, 1, 1]) || spynet.mean + | 0.226 | 0.224 | 0.229 | 0.003 | torch.Size([1, 3, 1, 1]) || spynet.std + | -0.000 | -0.679 | 0.720 | 0.066 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.0.basic_module.0.weight + | -0.042 | -0.894 | 0.351 | 0.344 | torch.Size([32]) || spynet.basic_module.0.basic_module.0.bias + | -0.008 | -3.201 | 0.948 | 0.097 
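The structure printout ends above with the reconstruction tail: conv_after_body (180 -> 120), conv_before_upsample (down to 64 channels), then an Upsample that applies two rounds of Conv3d 64 -> 256 followed by PixelShuffle(2), each bracketed by Transpose_Dim12 so the 2D pixel shuffle runs frame-by-frame on the 5D tensor; 256 = 64 * 2^2 per round, giving a 4x spatial scale overall before conv_last emits the 3-channel frames. The table that begins here lists per-tensor statistics for the whole model; a minimal helper in the same spirit (format approximated, and KAIR's own logging helper may differ; note that rows such as relative_position_index and spynet.mean come from buffers, not trainable parameters):

    import itertools
    import torch

    def describe_tensors(model: torch.nn.Module) -> None:
        # mean/min/max/std and shape for every parameter and buffer,
        # roughly matching the table format in this log
        print(' | mean | min | max | std || shape')
        for name, t in itertools.chain(model.named_parameters(), model.named_buffers()):
            v = t.detach().float()
            print(f' | {v.mean():.3f} | {v.min():.3f} | {v.max():.3f} | {v.std():.3f} '
                  f'| {v.shape} || {name}')

The bias-table shapes in these rows also check out against the attention windows: (2*2-1) * (2*8-1) * (2*8-1) = 675 relative offsets with a 128 x 128 index over (2*8*8)^2 token pairs is consistent with the residual_group1 blocks attending over frame pairs, while residual_group2 attends over all six frames, giving (2*6-1) * (2*8-1) * (2*8-1) = 2475 offsets and a 384 x 384 index; the 6 columns of each table correspond to the 6 attention heads.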
| torch.Size([64, 32, 7, 7]) || spynet.basic_module.0.basic_module.2.weight + | 0.059 | -1.268 | 0.732 | 0.320 | torch.Size([64]) || spynet.basic_module.0.basic_module.2.bias + | -0.010 | -4.633 | 0.568 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.0.basic_module.4.weight + | 0.159 | -0.704 | 0.859 | 0.353 | torch.Size([32]) || spynet.basic_module.0.basic_module.4.bias + | -0.024 | -1.714 | 0.414 | 0.091 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.0.basic_module.6.weight + | 0.780 | -1.061 | 1.162 | 0.519 | torch.Size([16]) || spynet.basic_module.0.basic_module.6.bias + | 0.000 | -0.144 | 0.163 | 0.018 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.0.basic_module.8.weight + | 0.001 | -0.003 | 0.005 | 0.006 | torch.Size([2]) || spynet.basic_module.0.basic_module.8.bias + | 0.000 | -0.726 | 0.773 | 0.070 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.1.basic_module.0.weight + | -0.021 | -0.814 | 0.355 | 0.323 | torch.Size([32]) || spynet.basic_module.1.basic_module.0.bias + | -0.010 | -3.380 | 0.916 | 0.099 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.1.basic_module.2.weight + | 0.038 | -1.207 | 0.714 | 0.301 | torch.Size([64]) || spynet.basic_module.1.basic_module.2.bias + | -0.008 | -4.462 | 0.549 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.1.basic_module.4.weight + | 0.157 | -0.742 | 0.980 | 0.384 | torch.Size([32]) || spynet.basic_module.1.basic_module.4.bias + | -0.020 | -1.648 | 0.319 | 0.084 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.1.basic_module.6.weight + | 0.775 | -1.195 | 1.148 | 0.546 | torch.Size([16]) || spynet.basic_module.1.basic_module.6.bias + | -0.000 | -0.122 | 0.152 | 0.016 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.1.basic_module.8.weight + | -0.000 | -0.002 | 0.001 | 0.002 | torch.Size([2]) || spynet.basic_module.1.basic_module.8.bias + | 0.000 | -0.956 | 0.870 | 0.088 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.2.basic_module.0.weight + | -0.025 | -1.040 | 0.512 | 0.411 | torch.Size([32]) || spynet.basic_module.2.basic_module.0.bias + | -0.011 | -4.624 | 1.195 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.2.basic_module.2.weight + | 0.023 | -1.284 | 0.699 | 0.308 | torch.Size([64]) || spynet.basic_module.2.basic_module.2.bias + | -0.009 | -1.831 | 0.616 | 0.092 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.2.basic_module.4.weight + | 0.120 | -0.695 | 0.755 | 0.332 | torch.Size([32]) || spynet.basic_module.2.basic_module.4.bias + | -0.013 | -1.285 | 0.304 | 0.068 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.2.basic_module.6.weight + | 0.681 | -1.725 | 0.942 | 0.646 | torch.Size([16]) || spynet.basic_module.2.basic_module.6.bias + | 0.000 | -0.045 | 0.071 | 0.009 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.2.basic_module.8.weight + | -0.010 | -0.010 | -0.009 | 0.000 | torch.Size([2]) || spynet.basic_module.2.basic_module.8.bias + | -0.000 | -0.995 | 0.879 | 0.090 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.3.basic_module.0.weight + | -0.040 | -1.137 | 0.617 | 0.461 | torch.Size([32]) || spynet.basic_module.3.basic_module.0.bias + | -0.010 | -4.891 | 1.224 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.3.basic_module.2.weight + | 0.022 | -1.287 | 0.745 | 0.313 | torch.Size([64]) || spynet.basic_module.3.basic_module.2.bias + | -0.010 | -1.802 | 0.561 | 0.090 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.3.basic_module.4.weight + | 0.118 | -0.694 | 0.697 | 0.329 | torch.Size([32]) || spynet.basic_module.3.basic_module.4.bias + | -0.012 
| -1.107 | 0.306 | 0.064 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.3.basic_module.6.weight + | 0.658 | -1.792 | 0.905 | 0.659 | torch.Size([16]) || spynet.basic_module.3.basic_module.6.bias + | 0.000 | -0.030 | 0.037 | 0.006 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.3.basic_module.8.weight + | 0.003 | -0.001 | 0.007 | 0.006 | torch.Size([2]) || spynet.basic_module.3.basic_module.8.bias + | -0.000 | -0.990 | 0.880 | 0.090 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.4.basic_module.0.weight + | -0.010 | -1.067 | 0.596 | 0.437 | torch.Size([32]) || spynet.basic_module.4.basic_module.0.bias + | -0.010 | -5.061 | 1.229 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.4.basic_module.2.weight + | 0.024 | -1.274 | 0.830 | 0.318 | torch.Size([64]) || spynet.basic_module.4.basic_module.2.bias + | -0.009 | -1.787 | 0.563 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.4.basic_module.4.weight + | 0.130 | -0.685 | 0.743 | 0.335 | torch.Size([32]) || spynet.basic_module.4.basic_module.4.bias + | -0.011 | -0.973 | 0.292 | 0.061 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.4.basic_module.6.weight + | 0.659 | -1.855 | 0.931 | 0.679 | torch.Size([16]) || spynet.basic_module.4.basic_module.6.bias + | 0.000 | -0.034 | 0.040 | 0.005 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.4.basic_module.8.weight + | -0.001 | -0.009 | 0.007 | 0.012 | torch.Size([2]) || spynet.basic_module.4.basic_module.8.bias + | -0.000 | -0.973 | 0.853 | 0.089 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.5.basic_module.0.weight + | 0.022 | -1.001 | 0.571 | 0.440 | torch.Size([32]) || spynet.basic_module.5.basic_module.0.bias + | -0.009 | -5.095 | 1.251 | 0.119 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.5.basic_module.2.weight + | 0.026 | -1.305 | 0.880 | 0.326 | torch.Size([64]) || spynet.basic_module.5.basic_module.2.bias + | -0.008 | -1.815 | 0.561 | 0.091 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.5.basic_module.4.weight + | 0.137 | -0.711 | 0.771 | 0.342 | torch.Size([32]) || spynet.basic_module.5.basic_module.4.bias + | -0.010 | -0.986 | 0.286 | 0.059 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.5.basic_module.6.weight + | 0.671 | -1.913 | 0.966 | 0.700 | torch.Size([16]) || spynet.basic_module.5.basic_module.6.bias + | 0.000 | -0.034 | 0.028 | 0.002 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.5.basic_module.8.weight + | 0.002 | -0.013 | 0.016 | 0.020 | torch.Size([2]) || spynet.basic_module.5.basic_module.8.bias + | 1.280 | 0.669 | 1.862 | 0.274 | torch.Size([120]) || stage1.reshape.1.weight + | -0.006 | -0.324 | 0.337 | 0.106 | torch.Size([120]) || stage1.reshape.1.bias + | 0.579 | 0.129 | 1.064 | 0.236 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.weight + | -0.039 | -1.100 | 0.894 | 0.226 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.bias + | -0.134 | -4.020 | 2.585 | 0.295 | torch.Size([675, 6]) || stage1.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.579 | 0.618 | 0.113 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_self.weight + | 0.000 | -0.319 | 0.279 | 0.074 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_self.bias + | 0.001 | -0.634 | 0.686 | 0.076 | torch.Size([120, 240]) || 
stage1.residual_group1.blocks.0.attn.proj.weight + | -0.014 | -0.222 | 0.642 | 0.088 | torch.Size([120]) || stage1.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -1.066 | 0.928 | 0.097 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.000 | -0.146 | 0.190 | 0.033 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.781 | 0.367 | 1.203 | 0.160 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.weight + | 0.029 | -0.378 | 0.545 | 0.159 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.bias + | 0.001 | -0.687 | 0.753 | 0.108 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc11.weight + | -0.010 | -0.229 | 0.633 | 0.095 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.674 | 0.669 | 0.117 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc12.weight + | 0.011 | -0.448 | 0.368 | 0.116 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc12.bias + | 0.001 | -0.862 | 0.941 | 0.119 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.mlp.fc2.weight + | -0.004 | -0.267 | 0.594 | 0.099 | torch.Size([120]) || stage1.residual_group1.blocks.0.mlp.fc2.bias + | 0.797 | 0.211 | 1.475 | 0.209 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.weight + | -0.161 | -1.941 | 0.746 | 0.237 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.bias + | -0.296 | -3.927 | 2.840 | 0.478 | torch.Size([675, 6]) || stage1.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.1.attn.position_bias + | 0.001 | -1.479 | 1.395 | 0.143 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_self.weight + | -0.003 | -0.381 | 0.258 | 0.063 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.526 | 0.561 | 0.079 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.attn.proj.weight + | -0.003 | -0.178 | 0.478 | 0.078 | torch.Size([120]) || stage1.residual_group1.blocks.1.attn.proj.bias + | 0.001 | -1.242 | 1.138 | 0.105 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.004 | -0.213 | 0.196 | 0.050 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.702 | 0.349 | 0.904 | 0.085 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.weight + | 0.039 | -0.646 | 0.384 | 0.132 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.872 | 0.750 | 0.131 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc11.weight + | -0.049 | -0.353 | 0.135 | 0.084 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.562 | 0.580 | 0.117 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc12.weight + | 0.000 | -0.238 | 0.457 | 0.113 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.828 | 0.685 | 0.123 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.mlp.fc2.weight + | 0.031 | -0.297 | 0.419 | 0.094 | torch.Size([120]) || stage1.residual_group1.blocks.1.mlp.fc2.bias + | 0.984 | 0.163 | 1.398 | 0.202 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.weight + | -0.167 | -1.609 | 0.367 | 0.182 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.bias + | -0.343 | -4.484 | 2.362 | 
0.486 | torch.Size([675, 6]) || stage1.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.2.attn.position_bias + | 0.000 | -1.586 | 1.649 | 0.151 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_self.weight + | -0.000 | -0.220 | 0.240 | 0.056 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.378 | 0.514 | 0.086 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.attn.proj.weight + | -0.009 | -0.143 | 0.172 | 0.059 | torch.Size([120]) || stage1.residual_group1.blocks.2.attn.proj.bias + | 0.001 | -0.639 | 0.582 | 0.102 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.000 | -0.141 | 0.173 | 0.035 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.733 | 0.277 | 0.903 | 0.081 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.weight + | 0.038 | -0.861 | 0.359 | 0.142 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.787 | 0.679 | 0.131 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc11.weight + | -0.029 | -0.365 | 0.143 | 0.076 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.574 | 0.539 | 0.120 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc12.weight + | -0.007 | -0.283 | 0.254 | 0.097 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc12.bias + | 0.001 | -0.998 | 0.522 | 0.124 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.mlp.fc2.weight + | 0.030 | -0.169 | 0.293 | 0.095 | torch.Size([120]) || stage1.residual_group1.blocks.2.mlp.fc2.bias + | 1.035 | 0.143 | 1.397 | 0.196 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.weight + | -0.161 | -1.413 | 0.084 | 0.154 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.bias + | -0.441 | -4.685 | 3.306 | 0.529 | torch.Size([675, 6]) || stage1.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.3.attn.position_bias + | 0.000 | -1.590 | 1.329 | 0.155 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_self.weight + | -0.002 | -0.266 | 0.232 | 0.049 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.366 | 0.372 | 0.084 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.attn.proj.weight + | -0.011 | -0.225 | 0.171 | 0.071 | torch.Size([120]) || stage1.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.660 | 0.801 | 0.100 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.001 | -0.139 | 0.200 | 0.031 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.724 | 0.190 | 0.911 | 0.091 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.weight + | 0.038 | -0.981 | 0.285 | 0.137 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.bias + | 0.001 | -0.611 | 0.598 | 0.130 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc11.weight + | -0.035 | -0.299 | 0.221 | 0.081 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.502 | 0.520 
| 0.124 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.002 | -0.271 | 0.215 | 0.090 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.558 | 0.898 | 0.127 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.010 | -0.424 | 0.190 | 0.082 | torch.Size([120]) || stage1.residual_group1.blocks.3.mlp.fc2.bias
+ | 1.085 | 0.169 | 1.400 | 0.157 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.weight
+ | -0.086 | -1.613 | 0.150 | 0.160 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.bias
+ | -0.541 | -3.902 | 3.728 | 0.633 | torch.Size([675, 6]) || stage1.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.4.attn.position_bias
+ | 0.001 | -1.879 | 1.832 | 0.150 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_self.weight
+ | 0.001 | -0.391 | 0.444 | 0.079 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_self.bias
+ | -0.000 | -0.407 | 0.448 | 0.087 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.attn.proj.weight
+ | -0.013 | -0.302 | 0.342 | 0.104 | torch.Size([120]) || stage1.residual_group1.blocks.4.attn.proj.bias
+ | -0.001 | -0.830 | 0.863 | 0.102 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_mut.weight
+ | -0.001 | -0.117 | 0.094 | 0.024 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.704 | 0.195 | 0.870 | 0.079 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.weight
+ | 0.031 | -1.069 | 0.276 | 0.140 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.bias
+ | -0.000 | -0.656 | 0.555 | 0.130 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc11.weight
+ | -0.029 | -0.387 | 0.256 | 0.102 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc11.bias
+ | 0.001 | -0.590 | 0.624 | 0.127 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc12.weight
+ | -0.011 | -0.277 | 0.303 | 0.087 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc12.bias
+ | -0.000 | -1.124 | 0.539 | 0.130 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.006 | -0.718 | 0.133 | 0.094 | torch.Size([120]) || stage1.residual_group1.blocks.4.mlp.fc2.bias
+ | 1.037 | 0.176 | 1.327 | 0.158 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.weight
+ | -0.112 | -1.591 | 0.177 | 0.169 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.bias
+ | -0.438 | -2.229 | 2.797 | 0.523 | torch.Size([675, 6]) || stage1.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -2.212 | 1.826 | 0.153 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_self.weight
+ | 0.001 | -0.343 | 0.338 | 0.068 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_self.bias
+ | 0.000 | -0.367 | 0.451 | 0.087 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.attn.proj.weight
+ | -0.022 | -0.358 | 0.242 | 0.128 | torch.Size([120]) || stage1.residual_group1.blocks.5.attn.proj.bias
+ | 0.001 | -0.922 | 0.886 | 0.104 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_mut.weight
+ | 0.002 | -0.083 | 0.089 | 0.022 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.662 | 0.277 | 0.831 | 0.066 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.weight
+ | 0.025 | -0.959 | 0.261 | 0.132 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.bias
+ | -0.001 | -0.636 | 0.739 | 0.129 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc11.weight
+ | -0.030 | -0.419 | 0.517 | 0.115 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc11.bias
+ | -0.000 | -0.615 | 0.709 | 0.126 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.002 | -0.230 | 0.457 | 0.087 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc12.bias
+ | 0.001 | -1.724 | 1.186 | 0.132 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.mlp.fc2.weight
+ | -0.019 | -1.909 | 0.255 | 0.190 | torch.Size([120]) || stage1.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.242 | 0.244 | 0.057 | torch.Size([120, 120]) || stage1.linear1.weight
+ | 0.004 | -0.221 | 0.224 | 0.083 | torch.Size([120]) || stage1.linear1.bias
+ | 0.737 | 0.334 | 1.046 | 0.119 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.weight
+ | 0.013 | -0.911 | 0.763 | 0.193 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.bias
+ | -0.052 | -2.462 | 2.040 | 0.273 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.0.attn.relative_position_index
+ | 0.000 | -0.785 | 0.767 | 0.123 | torch.Size([360, 120]) || stage1.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.009 | -0.466 | 0.552 | 0.122 | torch.Size([360]) || stage1.residual_group2.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.431 | 0.475 | 0.091 | torch.Size([120, 120]) || stage1.residual_group2.blocks.0.attn.proj.weight
+ | -0.009 | -0.796 | 0.497 | 0.109 | torch.Size([120]) || stage1.residual_group2.blocks.0.attn.proj.bias
+ | 0.573 | 0.409 | 0.935 | 0.096 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.weight
+ | 0.015 | -0.828 | 0.839 | 0.175 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.bias
+ | 0.001 | -0.604 | 0.542 | 0.109 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc11.weight
+ | 0.037 | -0.179 | 0.273 | 0.076 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.666 | 0.553 | 0.116 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.001 | -0.416 | 0.396 | 0.116 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc12.bias
+ | 0.001 | -0.654 | 0.538 | 0.118 | torch.Size([120, 240]) || stage1.residual_group2.blocks.0.mlp.fc2.weight
+ | -0.002 | -0.470 | 0.310 | 0.122 | torch.Size([120]) || stage1.residual_group2.blocks.0.mlp.fc2.bias
+ | 0.951 | 0.342 | 1.189 | 0.111 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.weight
+ | 0.010 | -0.697 | 0.802 | 0.166 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.bias
+ | -0.098 | -2.648 | 2.410 | 0.214 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.1.attn.relative_position_index
+ | -0.000 | -0.733 | 0.886 | 0.139 | torch.Size([360, 120]) || stage1.residual_group2.blocks.1.attn.qkv_self.weight
+ | -0.002 | -0.468 | 0.550 | 0.132 | torch.Size([360]) || stage1.residual_group2.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.435 | 0.377 | 0.096 | torch.Size([120, 120]) || stage1.residual_group2.blocks.1.attn.proj.weight
+ | -0.001 | -0.359 | 0.258 | 0.114 | torch.Size([120]) || stage1.residual_group2.blocks.1.attn.proj.bias
+ | 0.582 | 0.305 | 0.717 | 0.055 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.weight
+ | 0.008 | -0.714 | 0.833 | 0.131 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.bias
+ | 0.001 | -0.732 | 0.501 | 0.118 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc11.weight
+ | 0.004 | -0.306 | 0.267 | 0.091 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.510 | 0.533 | 0.126 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc12.weight
+ | -0.000 | -0.315 | 0.291 | 0.090 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.736 | 0.789 | 0.126 | torch.Size([120, 240]) || stage1.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.000 | -1.274 | 1.328 | 0.200 | torch.Size([120]) || stage1.residual_group2.blocks.1.mlp.fc2.bias
+ | -0.000 | -0.390 | 0.303 | 0.069 | torch.Size([120, 120]) || stage1.linear2.weight
+ | 0.010 | -0.219 | 0.227 | 0.087 | torch.Size([120]) || stage1.linear2.bias
+ | -0.000 | -0.095 | 0.106 | 0.024 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.weight
+ | -0.001 | -0.036 | 0.036 | 0.013 | torch.Size([120]) || stage1.pa_deform.bias
+ | -0.000 | -0.136 | 0.141 | 0.017 | torch.Size([120, 242, 3, 3]) || stage1.pa_deform.conv_offset.0.weight
+ | -0.002 | -0.028 | 0.024 | 0.013 | torch.Size([120]) || stage1.pa_deform.conv_offset.0.bias
+ | -0.001 | -0.156 | 0.104 | 0.019 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.2.weight
+ | -0.008 | -0.055 | 0.045 | 0.022 | torch.Size([120]) || stage1.pa_deform.conv_offset.2.bias
+ | -0.001 | -0.098 | 0.106 | 0.018 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.4.weight
+ | -0.000 | -0.081 | 0.070 | 0.029 | torch.Size([120]) || stage1.pa_deform.conv_offset.4.bias
+ | -0.000 | -0.375 | 0.279 | 0.027 | torch.Size([324, 120, 3, 3]) || stage1.pa_deform.conv_offset.6.weight
+ | -0.003 | -0.074 | 0.070 | 0.028 | torch.Size([324]) || stage1.pa_deform.conv_offset.6.bias
+ | -0.000 | -0.776 | 0.733 | 0.114 | torch.Size([360, 360]) || stage1.pa_fuse.fc11.weight
+ | 0.021 | -0.239 | 0.513 | 0.121 | torch.Size([360]) || stage1.pa_fuse.fc11.bias
+ | 0.001 | -1.100 | 1.143 | 0.149 | torch.Size([360, 360]) || stage1.pa_fuse.fc12.weight
+ | 0.008 | -0.405 | 0.393 | 0.136 | torch.Size([360]) || stage1.pa_fuse.fc12.bias
+ | 0.000 | -0.963 | 0.899 | 0.142 | torch.Size([120, 360]) || stage1.pa_fuse.fc2.weight
+ | -0.055 | -0.616 | 0.599 | 0.197 | torch.Size([120]) || stage1.pa_fuse.fc2.bias
+ | 1.149 | 0.345 | 1.921 | 0.289 | torch.Size([480]) || stage2.reshape.1.weight
+ | 0.017 | -0.502 | 0.663 | 0.141 | torch.Size([480]) || stage2.reshape.1.bias
+ | -0.000 | -0.609 | 0.736 | 0.146 | torch.Size([120, 480]) || stage2.reshape.2.weight
+ | 0.006 | -0.136 | 0.404 | 0.077 | torch.Size([120]) || stage2.reshape.2.bias
+ | 0.686 | 0.172 | 1.113 | 0.175 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.weight
+ | -0.154 | -0.926 | 0.339 | 0.217 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.bias
+ | -0.120 | -1.869 | 4.616 | 0.310 | torch.Size([675, 6]) || stage2.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.0.attn.position_bias
+ | 0.000 | -0.514 | 0.499 | 0.102 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.002 | -0.214 | 0.177 | 0.044 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_self.bias
+ | -0.001 | -0.499 | 0.529 | 0.093 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.attn.proj.weight
+ | -0.004 | -0.171 | 0.556 | 0.087 | torch.Size([120]) || stage2.residual_group1.blocks.0.attn.proj.bias
+ | -0.000 | -0.642 | 0.598 | 0.083 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_mut.weight
+ | -0.000 | -0.141 | 0.125 | 0.027 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.592 | 0.325 | 0.794 | 0.096 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.weight
+ | 0.008 | -0.649 | 0.445 | 0.168 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.bias
+ | 0.000 | -0.485 | 0.457 | 0.116 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc11.weight
+ | -0.053 | -0.240 | 0.171 | 0.062 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.503 | 0.462 | 0.118 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc12.weight
+ | 0.005 | -0.177 | 0.268 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.690 | 0.498 | 0.123 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.mlp.fc2.weight
+ | -0.007 | -0.270 | 0.472 | 0.097 | torch.Size([120]) || stage2.residual_group1.blocks.0.mlp.fc2.bias
+ | 0.864 | 0.187 | 1.221 | 0.164 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.weight
+ | -0.146 | -1.128 | 0.299 | 0.204 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.bias
+ | -0.241 | -1.607 | 8.958 | 0.356 | torch.Size([675, 6]) || stage2.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.1.attn.position_bias
+ | 0.000 | -0.561 | 0.538 | 0.116 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.001 | -0.198 | 0.222 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_self.bias
+ | 0.001 | -0.475 | 0.479 | 0.099 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.attn.proj.weight
+ | -0.006 | -0.295 | 0.341 | 0.101 | torch.Size([120]) || stage2.residual_group1.blocks.1.attn.proj.bias
+ | 0.001 | -0.961 | 0.789 | 0.080 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_mut.weight
+ | 0.001 | -0.105 | 0.143 | 0.024 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 0.653 | 0.401 | 0.810 | 0.063 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.weight
+ | 0.009 | -0.767 | 0.367 | 0.154 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.bias
+ | 0.001 | -0.486 | 0.499 | 0.117 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.056 | -0.185 | 0.147 | 0.058 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.529 | 0.548 | 0.121 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc12.weight
+ | 0.002 | -0.231 | 0.177 | 0.071 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc12.bias
+ | -0.001 | -0.578 | 0.609 | 0.123 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.mlp.fc2.weight
+ | -0.003 | -0.350 | 0.216 | 0.098 | torch.Size([120]) || stage2.residual_group1.blocks.1.mlp.fc2.bias
+ | 0.848 | 0.172 | 1.107 | 0.144 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.weight
+ | -0.168 | -1.123 | 0.330 | 0.178 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.bias
+ | -0.074 | -1.239 | 4.293 | 0.247 | torch.Size([675, 6]) || stage2.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.2.attn.position_bias
+ | -0.001 | -0.643 | 0.531 | 0.117 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_self.weight
+ | 0.003 | -0.220 | 0.376 | 0.047 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_self.bias
+ | 0.000 | -0.529 | 0.479 | 0.100 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.attn.proj.weight
+ | 0.002 | -0.230 | 0.295 | 0.074 | torch.Size([120]) || stage2.residual_group1.blocks.2.attn.proj.bias
+ | -0.001 | -0.726 | 0.768 | 0.091 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_mut.weight
+ | 0.001 | -0.167 | 0.193 | 0.028 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 0.695 | 0.334 | 0.833 | 0.068 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.weight
+ | 0.012 | -0.755 | 0.517 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.bias
+ | 0.001 | -0.474 | 0.480 | 0.119 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.049 | -0.218 | 0.148 | 0.067 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc11.bias
+ | 0.000 | -0.529 | 0.542 | 0.124 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc12.weight
+ | -0.006 | -0.245 | 0.239 | 0.073 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc12.bias
+ | -0.001 | -0.541 | 0.485 | 0.124 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.mlp.fc2.weight
+ | 0.000 | -0.318 | 0.170 | 0.077 | torch.Size([120]) || stage2.residual_group1.blocks.2.mlp.fc2.bias
+ | 0.903 | 0.178 | 1.124 | 0.124 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.weight
+ | -0.138 | -1.223 | 0.440 | 0.177 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.bias
+ | -0.164 | -1.383 | 5.910 | 0.305 | torch.Size([675, 6]) || stage2.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.3.attn.position_bias
+ | -0.000 | -0.526 | 0.496 | 0.120 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_self.weight
+ | 0.000 | -0.250 | 0.273 | 0.061 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_self.bias
+ | 0.000 | -0.447 | 0.524 | 0.097 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.attn.proj.weight
+ | -0.003 | -0.243 | 0.256 | 0.082 | torch.Size([120]) || stage2.residual_group1.blocks.3.attn.proj.bias
+ | -0.001 | -0.551 | 0.730 | 0.083 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_mut.weight
+ | -0.001 | -0.145 | 0.126 | 0.024 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 0.707 | 0.319 | 0.855 | 0.063 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.weight
+ | 0.013 | -0.839 | 0.507 | 0.155 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.bias
+ | 0.000 | -0.509 | 0.508 | 0.118 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.051 | -0.219 | 0.155 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc11.bias
+ | -0.000 | -0.475 | 0.592 | 0.124 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.002 | -0.162 | 0.220 | 0.069 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.465 | 0.528 | 0.124 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.mlp.fc2.weight
+ | -0.002 | -0.243 | 0.286 | 0.088 | torch.Size([120]) || stage2.residual_group1.blocks.3.mlp.fc2.bias
+ | 0.948 | 0.220 | 1.175 | 0.108 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.weight
+ | -0.125 | -1.093 | 0.385 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.bias
+ | -0.150 | -1.632 | 4.522 | 0.341 | torch.Size([675, 6]) || stage2.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.636 | 0.543 | 0.119 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.001 | -0.254 | 0.262 | 0.048 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_self.bias
+ | 0.001 | -0.632 | 0.628 | 0.112 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.attn.proj.weight
+ | -0.005 | -0.240 | 0.330 | 0.104 | torch.Size([120]) || stage2.residual_group1.blocks.4.attn.proj.bias
+ | 0.000 | -0.476 | 0.479 | 0.088 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_mut.weight
+ | -0.001 | -0.112 | 0.134 | 0.020 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.686 | 0.264 | 0.797 | 0.060 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.weight
+ | 0.012 | -0.889 | 0.427 | 0.140 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.bias
+ | 0.001 | -0.476 | 0.478 | 0.117 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc11.weight
+ | -0.051 | -0.267 | 0.180 | 0.071 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc11.bias
+ | 0.000 | -0.506 | 0.517 | 0.127 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.002 | -0.172 | 0.241 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc12.bias
+ | -0.001 | -0.570 | 0.542 | 0.126 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.003 | -0.631 | 0.395 | 0.123 | torch.Size([120]) || stage2.residual_group1.blocks.4.mlp.fc2.bias
+ | 0.912 | 0.189 | 1.122 | 0.104 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.weight
+ | -0.114 | -1.125 | 0.188 | 0.140 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.bias
+ | -0.099 | -1.285 | 1.708 | 0.236 | torch.Size([675, 6]) || stage2.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -0.496 | 0.540 | 0.119 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_self.weight
+ | 0.003 | -0.260 | 0.228 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_self.bias
+ | -0.000 | -0.511 | 0.454 | 0.095 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.attn.proj.weight
+ | 0.000 | -0.711 | 0.286 | 0.115 | torch.Size([120]) || stage2.residual_group1.blocks.5.attn.proj.bias
+ | 0.000 | -0.444 | 0.454 | 0.082 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_mut.weight
+ | -0.000 | -0.101 | 0.133 | 0.021 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.668 | 0.312 | 0.800 | 0.056 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.weight
+ | 0.015 | -0.778 | 0.372 | 0.111 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.bias
+ | -0.000 | -0.485 | 0.469 | 0.115 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc11.weight
+ | -0.045 | -0.294 | 0.173 | 0.083 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc11.bias
+ | 0.000 | -0.554 | 0.540 | 0.129 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.001 | -0.183 | 0.199 | 0.077 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc12.bias
+ | 0.000 | -0.879 | 0.824 | 0.127 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.mlp.fc2.weight
+ | 0.001 | -1.670 | 0.358 | 0.208 | torch.Size([120]) || stage2.residual_group1.blocks.5.mlp.fc2.bias
+ | 0.001 | -0.253 | 0.346 | 0.068 | torch.Size([120, 120]) || stage2.linear1.weight
+ | 0.007 | -0.248 | 0.241 | 0.103 | torch.Size([120]) || stage2.linear1.bias
+ | 1.012 | 0.613 | 1.327 | 0.116 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.weight
+ | 0.019 | -0.724 | 0.685 | 0.244 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.bias
+ | 0.003 | -2.959 | 1.705 | 0.151 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.636 | 0.617 | 0.125 | torch.Size([360, 120]) || stage2.residual_group2.blocks.0.attn.qkv_self.weight
+ | -0.002 | -0.291 | 0.292 | 0.085 | torch.Size([360]) || stage2.residual_group2.blocks.0.attn.qkv_self.bias
+ | -0.002 | -0.476 | 0.512 | 0.138 | torch.Size([120, 120]) || stage2.residual_group2.blocks.0.attn.proj.weight
+ | -0.002 | -0.263 | 0.398 | 0.135 | torch.Size([120]) || stage2.residual_group2.blocks.0.attn.proj.bias
+ | 0.677 | 0.521 | 0.840 | 0.063 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.weight
+ | 0.010 | -0.710 | 0.541 | 0.173 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.bias
+ | 0.001 | -0.540 | 0.507 | 0.112 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.016 | -0.242 | 0.201 | 0.077 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.519 | 0.479 | 0.122 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.006 | -0.162 | 0.231 | 0.071 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc12.bias
+ | -0.001 | -0.449 | 0.494 | 0.121 | torch.Size([120, 240]) || stage2.residual_group2.blocks.0.mlp.fc2.weight
+ | 0.002 | -0.293 | 0.222 | 0.095 | torch.Size([120]) || stage2.residual_group2.blocks.0.mlp.fc2.bias
+ | 1.053 | 0.832 | 1.269 | 0.079 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.weight
+ | 0.015 | -0.549 | 0.428 | 0.189 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.bias
+ | 0.007 | -3.099 | 1.550 | 0.170 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.1.attn.relative_position_index
+ | 0.000 | -0.673 | 0.604 | 0.131 | torch.Size([360, 120]) || stage2.residual_group2.blocks.1.attn.qkv_self.weight
+ | -0.001 | -0.416 | 0.391 | 0.089 | torch.Size([360]) || stage2.residual_group2.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.569 | 0.560 | 0.139 | torch.Size([120, 120]) || stage2.residual_group2.blocks.1.attn.proj.weight
+ | 0.004 | -0.613 | 0.428 | 0.158 | torch.Size([120]) || stage2.residual_group2.blocks.1.attn.proj.bias
+ | 0.762 | 0.464 | 0.954 | 0.085 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.weight
+ | 0.005 | -0.745 | 0.381 | 0.117 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.bias
+ | 0.000 | -0.441 | 0.448 | 0.110 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc11.weight
+ | 0.019 | -0.292 | 0.460 | 0.117 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.491 | 0.490 | 0.126 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc12.weight
+ | -0.007 | -0.285 | 0.177 | 0.068 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.535 | 0.631 | 0.125 | torch.Size([120, 240]) || stage2.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.011 | -0.765 | 0.337 | 0.142 | torch.Size([120]) || stage2.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.001 | -0.367 | 0.372 | 0.074 | torch.Size([120, 120]) || stage2.linear2.weight
+ | 0.009 | -0.288 | 0.342 | 0.130 | torch.Size([120]) || stage2.linear2.bias
+ | 0.000 | -0.112 | 0.093 | 0.022 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.weight
+ | -0.002 | -0.036 | 0.035 | 0.016 | torch.Size([120]) || stage2.pa_deform.bias
+ | 0.000 | -0.068 | 0.080 | 0.016 | torch.Size([120, 242, 3, 3]) || stage2.pa_deform.conv_offset.0.weight
+ | -0.009 | -0.035 | 0.023 | 0.013 | torch.Size([120]) || stage2.pa_deform.conv_offset.0.bias
+ | 0.000 | -0.068 | 0.079 | 0.019 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.2.weight
+ | -0.014 | -0.061 | 0.036 | 0.021 | torch.Size([120]) || stage2.pa_deform.conv_offset.2.bias
+ | -0.001 | -0.082 | 0.079 | 0.019 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.4.weight
+ | -0.003 | -0.075 | 0.069 | 0.035 | torch.Size([120]) || stage2.pa_deform.conv_offset.4.bias
+ | -0.000 | -0.166 | 0.139 | 0.016 | torch.Size([324, 120, 3, 3]) || stage2.pa_deform.conv_offset.6.weight
+ | -0.015 | -0.090 | 0.050 | 0.030 | torch.Size([324]) || stage2.pa_deform.conv_offset.6.bias
+ | -0.002 | -0.642 | 0.663 | 0.127 | torch.Size([360, 360]) || stage2.pa_fuse.fc11.weight
+ | 0.130 | -0.171 | 0.480 | 0.140 | torch.Size([360]) || stage2.pa_fuse.fc11.bias
+ | -0.000 | -0.696 | 0.620 | 0.118 | torch.Size([360, 360]) || stage2.pa_fuse.fc12.weight
+ | -0.007 | -0.337 | 0.301 | 0.102 | torch.Size([360]) || stage2.pa_fuse.fc12.bias
+ | 0.000 | -0.650 | 0.657 | 0.128 | torch.Size([120, 360]) || stage2.pa_fuse.fc2.weight
+ | 0.013 | -0.507 | 0.451 | 0.215 | torch.Size([120]) || stage2.pa_fuse.fc2.bias
+ | 1.067 | 0.372 | 1.778 | 0.269 | torch.Size([480]) || stage3.reshape.1.weight
+ | -0.004 | -0.699 | 0.521 | 0.227 | torch.Size([480]) || stage3.reshape.1.bias
+ | -0.000 | -0.643 | 0.743 | 0.138 | torch.Size([120, 480]) || stage3.reshape.2.weight
+ | 0.009 | -0.176 | 0.243 | 0.079 | torch.Size([120]) || stage3.reshape.2.bias
+ | 0.785 | 0.469 | 1.029 | 0.105 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.weight
+ | -0.102 | -0.716 | 0.311 | 0.179 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.bias
+ | -0.001 | -0.340 | 0.163 | 0.033 | torch.Size([675, 6]) || stage3.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.0.attn.position_bias
+ | -0.000 | -0.328 | 0.302 | 0.061 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_self.weight
+ | 0.004 | -0.232 | 0.189 | 0.063 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.343 | 0.346 | 0.058 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.attn.proj.weight
+ | 0.004 | -0.335 | 0.229 | 0.102 | torch.Size([120]) || stage3.residual_group1.blocks.0.attn.proj.bias
+ | -0.000 | -0.366 | 0.325 | 0.052 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_mut.weight
+ | -0.001 | -0.091 | 0.074 | 0.017 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.751 | 0.517 | 0.928 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.weight
+ | 0.002 | -0.271 | 0.189 | 0.101 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.bias
+ | 0.000 | -0.371 | 0.388 | 0.096 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc11.weight
+ | -0.073 | -0.203 | 0.039 | 0.046 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.400 | 0.401 | 0.094 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc12.weight
+ | -0.000 | -0.178 | 0.128 | 0.052 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.001 | -0.410 | 0.429 | 0.098 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.006 | -0.345 | 0.304 | 0.108 | torch.Size([120]) || stage3.residual_group1.blocks.0.mlp.fc2.bias
+ | 0.816 | 0.469 | 1.015 | 0.110 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.weight
+ | -0.103 | -0.647 | 0.225 | 0.140 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.bias
+ | 0.001 | -0.464 | 0.239 | 0.034 | torch.Size([675, 6]) || stage3.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.1.attn.position_bias
+ | -0.000 | -0.304 | 0.359 | 0.061 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.001 | -0.173 | 0.193 | 0.047 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.299 | 0.408 | 0.055 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.attn.proj.weight
+ | 0.007 | -0.511 | 0.239 | 0.113 | torch.Size([120]) || stage3.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.288 | 0.254 | 0.049 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_mut.weight
+ | 0.001 | -0.060 | 0.054 | 0.016 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 0.796 | 0.609 | 0.971 | 0.076 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.weight
+ | -0.002 | -0.327 | 0.247 | 0.122 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.bias
+ | 0.001 | -0.379 | 0.407 | 0.094 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.077 | -0.214 | 0.034 | 0.045 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.391 | 0.432 | 0.092 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc12.weight
+ | 0.005 | -0.176 | 0.112 | 0.044 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.378 | 0.399 | 0.093 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.009 | -0.410 | 0.306 | 0.110 | torch.Size([120]) || stage3.residual_group1.blocks.1.mlp.fc2.bias
+ | 0.854 | 0.447 | 0.995 | 0.090 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.weight
+ | -0.086 | -0.513 | 0.198 | 0.116 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.bias
+ | -0.001 | -0.189 | 0.292 | 0.033 | torch.Size([675, 6]) || stage3.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.2.attn.position_bias
+ | 0.000 | -0.390 | 0.367 | 0.067 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_self.weight
+ | -0.002 | -0.310 | 0.284 | 0.078 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_self.bias
+ | 0.000 | -0.334 | 0.296 | 0.061 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.attn.proj.weight
+ | 0.004 | -0.356 | 0.299 | 0.096 | torch.Size([120]) || stage3.residual_group1.blocks.2.attn.proj.bias
+ | 0.000 | -0.276 | 0.315 | 0.055 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_mut.weight
+ | 0.000 | -0.094 | 0.066 | 0.014 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 0.829 | 0.673 | 1.017 | 0.074 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.weight
+ | 0.003 | -0.259 | 0.228 | 0.098 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.bias
+ | 0.001 | -0.410 | 0.385 | 0.091 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.085 | -0.200 | 0.017 | 0.044 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc11.bias
+ | 0.000 | -0.348 | 0.378 | 0.090 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc12.weight
+ | 0.001 | -0.130 | 0.105 | 0.042 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc12.bias
+ | 0.000 | -0.346 | 0.425 | 0.090 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.mlp.fc2.weight
+ | 0.005 | -0.363 | 0.241 | 0.094 | torch.Size([120]) || stage3.residual_group1.blocks.2.mlp.fc2.bias
+ | 0.872 | 0.554 | 1.068 | 0.102 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.weight
+ | -0.057 | -0.402 | 0.133 | 0.087 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.bias
+ | 0.003 | -0.365 | 0.217 | 0.050 | torch.Size([675, 6]) || stage3.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.359 | 0.357 | 0.065 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_self.weight
+ | -0.002 | -0.265 | 0.294 | 0.062 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_self.bias
+ | -0.000 | -0.300 | 0.271 | 0.054 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.attn.proj.weight
+ | 0.002 | -0.316 | 0.215 | 0.094 | torch.Size([120]) || stage3.residual_group1.blocks.3.attn.proj.bias
+ | 0.000 | -0.370 | 0.329 | 0.039 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_mut.weight
+ | 0.000 | -0.056 | 0.066 | 0.013 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 0.842 | 0.631 | 0.989 | 0.073 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.weight
+ | -0.001 | -0.216 | 0.263 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.bias
+ | 0.001 | -0.388 | 0.391 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.087 | -0.202 | 0.032 | 0.048 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc11.bias
+ | 0.000 | -0.364 | 0.428 | 0.088 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.000 | -0.137 | 0.106 | 0.043 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc12.bias
+ | -0.001 | -0.390 | 0.339 | 0.088 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.003 | -0.376 | 0.203 | 0.090 | torch.Size([120]) || stage3.residual_group1.blocks.3.mlp.fc2.bias
+ | 0.913 | 0.498 | 1.102 | 0.096 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.weight
+ | -0.048 | -0.340 | 0.105 | 0.071 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.bias
+ | 0.001 | -0.706 | 0.306 | 0.058 | torch.Size([675, 6]) || stage3.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.4.attn.position_bias
+ | 0.000 | -0.373 | 0.339 | 0.076 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.004 | -0.301 | 0.301 | 0.074 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_self.bias
+ | 0.000 | -0.278 | 0.277 | 0.058 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.attn.proj.weight
+ | 0.003 | -0.310 | 0.240 | 0.079 | torch.Size([120]) || stage3.residual_group1.blocks.4.attn.proj.bias
+ | -0.000 | -0.350 | 0.322 | 0.046 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_mut.weight
+ | -0.000 | -0.045 | 0.064 | 0.010 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.862 | 0.679 | 0.990 | 0.059 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.weight
+ | -0.004 | -0.313 | 0.190 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.bias
+ | 0.001 | -0.370 | 0.364 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc11.weight
+ | -0.092 | -0.231 | 0.129 | 0.057 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc11.bias
+ | -0.000 | -0.375 | 0.511 | 0.090 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.002 | -0.114 | 0.114 | 0.040 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc12.bias
+ | -0.000 | -0.389 | 0.354 | 0.088 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.mlp.fc2.weight
+ | 0.005 | -0.258 | 0.164 | 0.073 | torch.Size([120]) || stage3.residual_group1.blocks.4.mlp.fc2.bias
+ | 0.899 | 0.480 | 1.089 | 0.103 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.weight
+ | -0.030 | -0.257 | 0.115 | 0.056 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.bias
+ | 0.003 | -0.462 | 0.290 | 0.069 | torch.Size([675, 6]) || stage3.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.5.attn.position_bias
+ | 0.000 | -0.391 | 0.365 | 0.069 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_self.weight
+ | -0.004 | -0.232 | 0.302 | 0.064 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_self.bias
+ | -0.000 | -0.267 | 0.293 | 0.051 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.attn.proj.weight
+ | 0.000 | -0.250 | 0.182 | 0.070 | torch.Size([120]) || stage3.residual_group1.blocks.5.attn.proj.bias
+ | -0.000 | -0.238 | 0.257 | 0.033 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_mut.weight
+ | -0.001 | -0.032 | 0.033 | 0.008 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.864 | 0.651 | 1.029 | 0.070 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.weight
+ | -0.003 | -0.212 | 0.175 | 0.075 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.bias
+ | 0.000 | -0.378 | 0.379 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc11.weight
+ | -0.097 | -0.308 | 0.026 | 0.051 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc11.bias
+ | 0.000 | -0.578 | 0.401 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc12.weight
+ | -0.005 | -0.166 | 0.131 | 0.049 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc12.bias
+ | 0.000 | -0.358 | 0.376 | 0.085 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.mlp.fc2.weight
+ | 0.001 | -0.262 | 0.176 | 0.072 | torch.Size([120]) || stage3.residual_group1.blocks.5.mlp.fc2.bias
+ | 0.003 | -0.284 | 0.467 | 0.071 | torch.Size([120, 120]) || stage3.linear1.weight
+ | 0.006 | -0.201 | 0.269 | 0.090 | torch.Size([120]) || stage3.linear1.bias
+ | 0.877 | 0.568 | 1.197 | 0.115 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.weight
+ | 0.002 | -0.248 | 0.324 | 0.100 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.bias
+ | 0.000 | -0.261 | 0.125 | 0.029 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.563 | 0.552 | 0.074 | torch.Size([360, 120]) || stage3.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.005 | -0.257 | 0.302 | 0.081 | torch.Size([360]) || stage3.residual_group2.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.390 | 0.385 | 0.084 | torch.Size([120, 120]) || stage3.residual_group2.blocks.0.attn.proj.weight
+ | 0.002 | -0.450 | 0.235 | 0.125 | torch.Size([120]) || stage3.residual_group2.blocks.0.attn.proj.bias
+ | 0.986 | 0.755 | 1.165 | 0.078 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.weight
+ | -0.000 | -0.260 | 0.169 | 0.076 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.bias
+ | 0.000 | -0.355 | 0.397 | 0.087 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.046 | -0.220 | 0.086 | 0.055 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.424 | 0.368 | 0.089 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.006 | -0.111 | 0.122 | 0.038 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.354 | 0.374 | 0.090 | torch.Size([120, 240]) || stage3.residual_group2.blocks.0.mlp.fc2.weight
+ | 0.001 | -0.374 | 0.272 | 0.101 | torch.Size([120]) || stage3.residual_group2.blocks.0.mlp.fc2.bias
+ | 0.919 | 0.643 | 1.132 | 0.100 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.weight
+ | 0.000 | -0.177 | 0.181 | 0.063 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.bias
+ | 0.000 | -0.332 | 0.131 | 0.028 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.1.attn.relative_position_index
+ | -0.000 | -0.418 | 0.362 | 0.069 | torch.Size([360, 120]) || stage3.residual_group2.blocks.1.attn.qkv_self.weight
+ | -0.004 | -0.375 | 0.347 | 0.082 | torch.Size([360]) || stage3.residual_group2.blocks.1.attn.qkv_self.bias
+ | -0.001 | -0.294 | 0.354 | 0.077 | torch.Size([120, 120]) || stage3.residual_group2.blocks.1.attn.proj.weight
+ | 0.003 | -0.432 | 0.259 | 0.101 | torch.Size([120]) || stage3.residual_group2.blocks.1.attn.proj.bias
+ | 1.012 | 0.750 | 1.178 | 0.077 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.weight
+ | -0.001 | -0.171 | 0.155 | 0.060 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.bias
+ | 0.000 | -0.331 | 0.356 | 0.087 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.035 | -0.207 | 0.197 | 0.065 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.399 | 0.398 | 0.092 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc12.weight
+ | -0.002 | -0.111 | 0.129 | 0.041 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.001 | -0.353 | 0.330 | 0.088 | torch.Size([120, 240]) || stage3.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.001 | -0.328 | 0.127 | 0.064 | torch.Size([120]) || stage3.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.003 | -0.289 | 0.519 | 0.073 | torch.Size([120, 120]) || stage3.linear2.weight
+ | 0.002 | -0.318 | 0.371 | 0.144 | torch.Size([120]) || stage3.linear2.bias
+ | -0.000 | -0.086 | 0.095 | 0.022 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.weight
+ | -0.002 | -0.023 | 0.021 | 0.010 | torch.Size([120]) || stage3.pa_deform.bias
+ | -0.000 | -0.060 | 0.056 | 0.015 | torch.Size([120, 242, 3, 3]) || stage3.pa_deform.conv_offset.0.weight
+ | -0.008 | -0.035 | 0.019 | 0.013 | torch.Size([120]) || stage3.pa_deform.conv_offset.0.bias
+ | -0.001 | -0.064 | 0.062 | 0.019 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.2.weight
+ | -0.007 | -0.044 | 0.031 | 0.019 | torch.Size([120]) || stage3.pa_deform.conv_offset.2.bias
+ | 0.000 | -0.062 | 0.063 | 0.019 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.4.weight
+ | -0.006 | -0.052 | 0.043 | 0.021 | torch.Size([120]) || stage3.pa_deform.conv_offset.4.bias
+ | 0.000 | -0.081 | 0.080 | 0.011 | torch.Size([324, 120, 3, 3]) || stage3.pa_deform.conv_offset.6.weight
+ | -0.004 | -0.087 | 0.083 | 0.021 | torch.Size([324]) || stage3.pa_deform.conv_offset.6.bias
+ | -0.002 | -0.465 | 0.513 | 0.101 | torch.Size([360, 360]) || stage3.pa_fuse.fc11.weight
+ | 0.059 | -0.251 | 0.595 | 0.104 | torch.Size([360]) || stage3.pa_fuse.fc11.bias
+ | -0.000 | -0.544 | 0.531 | 0.100 | torch.Size([360, 360]) || stage3.pa_fuse.fc12.weight
+ | 0.001 | -0.589 | 0.433 | 0.106 | torch.Size([360]) || stage3.pa_fuse.fc12.bias
+ | -0.000 | -0.535 | 0.562 | 0.127 | torch.Size([120, 360]) || stage3.pa_fuse.fc2.weight
+ | -0.001 | -0.401 | 0.342 | 0.121 | torch.Size([120]) || stage3.pa_fuse.fc2.bias
+ | 0.997 | 0.921 | 1.125 | 0.028 | torch.Size([480]) || stage4.reshape.1.weight
+ | -0.000 | -0.058 | 0.059 | 0.022 | torch.Size([480]) || stage4.reshape.1.bias
+ | 0.000 | -0.155 | 0.150 | 0.031 | torch.Size([120, 480]) || stage4.reshape.2.weight
+ | 0.001 | -0.016 | 0.016 | 0.006 | torch.Size([120]) || stage4.reshape.2.bias
+ | 1.002 | 0.999 | 1.009 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.weight
+ | 0.000 | -0.002 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.bias
+ | -0.000 | -0.071 | 0.066 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.0.attn.position_bias
+ | 0.000 | -0.093 | 0.081 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.000 | -0.009 | 0.009 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.080 | 0.097 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.attn.proj.weight
+ | 0.000 | -0.035 | 0.027 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.0.attn.proj.bias
+ | 0.000 | -0.080 | 0.079 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_mut.weight
+ | -0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.bias
+ | -0.000 | -0.079 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc11.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.087 | 0.092 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc12.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.080 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.031 | 0.029 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.0.mlp.fc2.bias
+ | 1.002 | 0.997 | 1.007 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.weight
+ | -0.000 | -0.002 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.bias
+ | 0.000 | -0.066 | 0.065 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.1.attn.position_bias
+ | -0.000 | -0.078 | 0.081 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.006 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.080 | 0.083 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.attn.proj.weight
+ | -0.000 | -0.027 | 0.029 | 0.012 | torch.Size([120]) || stage4.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.077 | 0.082 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_mut.weight
+ | -0.000 | -0.006 | 0.009 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.weight
+ | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.bias
+ | -0.000 | -0.080 | 0.078 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.077 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.084 | 0.075 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.000 | -0.034 | 0.031 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.1.mlp.fc2.bias
+ | 1.002 | 0.996 | 1.008 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.weight
+ | -0.000 | -0.003 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.bias
+ | 0.001 | -0.070 | 0.071 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.2.attn.position_bias
+ | 0.000 | -0.091 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_self.weight
+ | -0.000 | -0.007 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_self.bias
+ | 0.000 | -0.080 | 0.084 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.attn.proj.weight
+ | -0.000 | -0.023 | 0.026 | 0.010 | torch.Size([120]) || stage4.residual_group1.blocks.2.attn.proj.bias
+ | -0.000 | -0.107 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_mut.weight
+ | 0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.weight
+ | 0.000 | -0.000 | 0.001 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.bias
+ | 0.000 | -0.076 | 0.077 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.000 | -0.005 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc11.bias
+ | -0.000 | -2.000 | 0.081 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc12.bias
+ | -0.000 | -0.084 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.mlp.fc2.weight
+ | 0.000 | -0.027 | 0.024 | 0.010 | torch.Size([120]) || stage4.residual_group1.blocks.2.mlp.fc2.bias
+ | 1.002 | 0.999 | 1.012 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.weight
+ | -0.000 | -0.003 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.bias
+ | 0.000 | -0.064 | 0.071 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.099 | 0.088 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_self.weight
+ | 0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_self.bias
+ | -0.000 | -0.083 | 0.084 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.attn.proj.weight
+ | -0.000 | -0.019 | 0.018 | 0.008 | torch.Size([120]) || stage4.residual_group1.blocks.3.attn.proj.bias
+ | 0.000 | -0.079 | 0.084 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_mut.weight
+ | -0.000 | -0.004 | 0.004 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.weight
+ | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.bias
+ | -0.000 | -0.078 | 0.081 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc11.bias
+ | -0.000 | -0.087 | 0.076 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc12.bias
+ | -0.000 | -0.079 | 0.082 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.000 | -0.022 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group1.blocks.3.mlp.fc2.bias
+ | 1.002 | 0.998 | 1.011 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.weight
+ | -0.001 | -0.004 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.bias
+ | 0.000 | -0.089 | 0.081 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.080 | 0.085 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_self.bias
+ | -0.000 | -0.075 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.attn.proj.weight
+ | -0.000 | -0.021 | 0.016 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.4.attn.proj.bias
+ | 0.000 | -0.082 | 0.088 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_mut.weight
+ | -0.000 | -0.004 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.weight
+ | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.bias
+ | -0.000 | -0.086 | 0.080 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc11.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc11.bias
+ | 0.000 | -0.084 | 0.083 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc12.bias
+ | 0.000 | -0.076 | 0.081 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.000 | -0.018 | 0.015 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.4.mlp.fc2.bias
+ | 1.003 | 0.997 | 1.014 | 0.003 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.weight
+ | -0.001 | -0.005 | 0.004 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.bias
+ | -0.001 | -0.070 | 0.069 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -0.097 | 0.082 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_self.weight
+ | 0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_self.bias
+ | -0.000 | -0.075 | 0.089 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.attn.proj.weight
+ | 0.000 | -0.016 | 0.015 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.5.attn.proj.bias
+ | 0.000 | -0.083 | 0.091 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_mut.weight
+ | 0.000 | -0.006 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.bias
+ | 0.000 | -0.093 | 0.083 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc11.weight
+ | 0.000 | -0.002 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc11.bias
+ | 0.000 | -0.086 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc12.bias
+ | 0.000 | -0.079 | 0.092 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.mlp.fc2.weight
+ | -0.000 | -0.012 | 0.016 | 0.005 | torch.Size([120]) || stage4.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.090 | 0.111 | 0.024 | torch.Size([120, 120]) || stage4.linear1.weight
+ | 0.001 | -0.019 | 0.029 | 0.009 | torch.Size([120]) || stage4.linear1.bias
+ | 1.000 | 0.999 | 1.003 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.bias
+ | -0.000 | -0.078 | 0.075 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.0.attn.relative_position_index
+ | 0.000 | -0.084 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.000 | -0.005 | 0.004 | 0.001 | torch.Size([360]) || stage4.residual_group2.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.079 | 0.080 | 0.020 | torch.Size([120, 120]) || stage4.residual_group2.blocks.0.attn.proj.weight
+ | 0.000 | -0.021 | 0.024 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.0.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.bias
+ | -0.000 | -0.079 | 0.072 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.077 | 0.078 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.102 | 0.078 | 0.020 | torch.Size([120, 240]) || stage4.residual_group2.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.024 | 0.020 | 0.009 | torch.Size([120]) || stage4.residual_group2.blocks.0.mlp.fc2.bias
+ | 1.001 | 0.998 | 1.003 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.weight
+ | -0.000 | -0.002 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.bias
+ | -0.000 | -0.071 | 0.079 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.1.attn.relative_position_index
+ | 0.000 | -0.078 | 0.096 | 0.020 | torch.Size([360, 120]) || stage4.residual_group2.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.005 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group2.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.077 | 0.080 | 0.020 | torch.Size([120, 120]) || stage4.residual_group2.blocks.1.attn.proj.weight
+ | 0.000 | -0.020 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.1.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.bias
+ | -0.000 | -0.085 | 0.082 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.083 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.000 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.078 | 0.078 | 0.020 | torch.Size([120, 240]) || stage4.residual_group2.blocks.1.mlp.fc2.weight
+ | 0.000 | -0.022 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.000 | -0.092 | 0.112 | 0.023 | torch.Size([120, 120]) || stage4.linear2.weight
+ | 0.000 | -0.032 | 0.049 | 0.015 | torch.Size([120]) || stage4.linear2.bias
+ | 0.000 | -0.036 | 0.037 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.weight
+ | 0.000 | -0.005 | 0.005 | 0.002 | torch.Size([120]) || stage4.pa_deform.bias
+ | -0.000 | -0.021 | 0.022 | 0.012 | torch.Size([120, 242, 3, 3]) || stage4.pa_deform.conv_offset.0.weight
+ | -0.001 | -0.021 | 0.021 | 0.012 | torch.Size([120]) || stage4.pa_deform.conv_offset.0.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.2.weight
+ | 0.002 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage4.pa_deform.conv_offset.2.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.4.weight
+ | -0.002 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage4.pa_deform.conv_offset.4.bias
+ | 0.000 | -0.003 | 0.002 | 0.000 | torch.Size([324, 120, 3, 3]) || stage4.pa_deform.conv_offset.6.weight
+ | 0.000 | -0.005 | 0.004 | 0.001 | torch.Size([324]) || stage4.pa_deform.conv_offset.6.bias
+ | 0.000 | -0.172 | 0.177 | 0.022 | torch.Size([360, 360]) || stage4.pa_fuse.fc11.weight
+ | 0.002 | -0.027 | 0.088 | 0.014 | torch.Size([360]) || stage4.pa_fuse.fc11.bias
+ | 0.000 | -0.212 | 0.163 | 0.022 | torch.Size([360, 360]) || stage4.pa_fuse.fc12.weight
+ | 0.000 | -0.066 | 0.081 | 0.014 | torch.Size([360]) || stage4.pa_fuse.fc12.bias
+ | 0.000 | -0.413 | 0.387 | 0.029 | torch.Size([120, 360]) || stage4.pa_fuse.fc2.weight
+ | -0.001 | -0.198 | 0.214 | 0.073 | torch.Size([120]) || stage4.pa_fuse.fc2.bias
+ | 0.979 | 0.896 | 1.076 | 0.053 | torch.Size([30]) || stage5.reshape.1.weight
+ | -0.005 | -0.074 | 0.100 | 0.043 | torch.Size([30]) || stage5.reshape.1.bias
+ | 0.000 | -0.240 | 0.249 | 0.058 | torch.Size([120, 30]) || stage5.reshape.2.weight
+ | -0.002 | -0.286 | 0.229 | 0.080 | torch.Size([120]) || stage5.reshape.2.bias
+ | 1.001 | 0.993 | 1.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.weight
+ | -0.004 | -0.018 | 0.006 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.bias
+ | -0.000 | -0.066 | 0.062 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.0.attn.position_bias
+ | -0.000 | -0.091 | 0.086 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.000 | -0.014 | 0.012 | 0.004 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.166 | 0.172 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.attn.proj.weight
+ | -0.001 | -0.053 | 0.045 | 0.018 | torch.Size([120]) || stage5.residual_group1.blocks.0.attn.proj.bias
+ | -0.000 | -0.090 | 0.081 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_mut.weight
+ | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.999 | 0.987 | 1.001 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.weight
+ | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.bias
+ | 0.000 | -0.094 | 0.079 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc11.weight
+ | 0.000 | -0.022 | 0.012 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.082 | 0.083 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc12.weight
+ | 0.000 | -0.013 | 0.014 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.075 | 0.083 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.073 | 0.078 | 0.021 | torch.Size([120]) || stage5.residual_group1.blocks.0.mlp.fc2.bias
+ | 1.001 | 0.994 | 1.007 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.weight
+ | -0.004 | -0.016 | 0.004 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.bias
+ | 0.000 | -0.065 | 0.063 | 0.020 | torch.Size([675, 6]) || 
stage5.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.1.attn.position_bias + | -0.000 | -0.077 | 0.083 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_self.weight + | 0.000 | -0.022 | 0.017 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.113 | 0.098 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.attn.proj.weight + | 0.000 | -0.058 | 0.045 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.080 | 0.080 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.000 | -0.008 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.999 | 0.982 | 1.001 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.weight + | 0.000 | -0.006 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.bias + | -0.000 | -0.076 | 0.083 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc11.weight + | 0.000 | -0.017 | 0.014 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.080 | 0.086 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc12.weight + | -0.000 | -0.014 | 0.016 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.096 | 0.079 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.mlp.fc2.weight + | 0.001 | -0.051 | 0.039 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.1.mlp.fc2.bias + | 1.002 | 0.998 | 1.009 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.weight + | -0.004 | -0.014 | 0.003 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.bias + | 0.000 | -0.067 | 0.073 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.085 | 0.087 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_self.weight + | 0.000 | -0.015 | 0.014 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.108 | 0.095 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.attn.proj.weight + | -0.001 | -0.043 | 0.039 | 0.013 | torch.Size([120]) || stage5.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.088 | 0.081 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.000 | -0.009 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.999 | 0.978 | 1.001 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.weight + | 0.000 | -0.003 | 0.004 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.076 | 0.081 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc11.weight + | -0.000 | -0.012 | 0.019 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.079 | 0.077 | 0.020 | torch.Size([240, 120]) 
|| stage5.residual_group1.blocks.2.mlp.fc12.weight + | -0.001 | -0.014 | 0.012 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.076 | 0.082 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.mlp.fc2.weight + | -0.000 | -0.047 | 0.043 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.2.mlp.fc2.bias + | 1.002 | 0.978 | 1.015 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.weight + | -0.004 | -0.013 | 0.004 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.bias + | -0.000 | -0.084 | 0.070 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.078 | 0.082 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_self.weight + | -0.000 | -0.014 | 0.014 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.123 | 0.132 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.attn.proj.weight + | 0.001 | -0.028 | 0.044 | 0.015 | torch.Size([120]) || stage5.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.082 | 0.089 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.999 | 0.974 | 1.001 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.weight + | 0.000 | -0.008 | 0.010 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.075 | 0.088 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc11.weight + | 0.000 | -0.014 | 0.019 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.081 | 0.080 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc12.weight + | 0.000 | -0.031 | 0.020 | 0.006 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.081 | 0.106 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.mlp.fc2.weight + | -0.002 | -0.046 | 0.042 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.3.mlp.fc2.bias + | 1.003 | 0.944 | 1.017 | 0.009 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.weight + | -0.005 | -0.015 | 0.004 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.071 | 0.067 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.085 | 0.090 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_self.weight + | -0.000 | -0.021 | 0.013 | 0.004 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.130 | 0.089 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.attn.proj.weight + | -0.001 | -0.036 | 0.024 | 0.011 | torch.Size([120]) || stage5.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.086 | 0.076 | 0.020 | torch.Size([360, 120]) || 
stage5.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.000 | -0.008 | 0.008 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.999 | 0.967 | 1.001 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.weight + | 0.000 | -0.006 | 0.007 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.bias + | 0.000 | -0.080 | 0.085 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc11.weight + | -0.001 | -0.015 | 0.010 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.081 | 0.077 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc12.weight + | -0.000 | -0.020 | 0.018 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.081 | 0.085 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.mlp.fc2.weight + | -0.001 | -0.037 | 0.050 | 0.014 | torch.Size([120]) || stage5.residual_group1.blocks.4.mlp.fc2.bias + | 1.004 | 0.976 | 1.039 | 0.008 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.weight + | -0.005 | -0.015 | 0.005 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.bias + | -0.000 | -0.070 | 0.076 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.099 | 0.097 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_self.weight + | -0.000 | -0.011 | 0.012 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.084 | 0.093 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.attn.proj.weight + | 0.000 | -0.038 | 0.035 | 0.012 | torch.Size([120]) || stage5.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.087 | 0.082 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.000 | -0.008 | 0.010 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.998 | 0.960 | 1.002 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.weight + | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.088 | 0.095 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc11.weight + | -0.000 | -0.014 | 0.027 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.081 | 0.074 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc12.weight + | 0.000 | -0.013 | 0.025 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -0.100 | 0.086 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.mlp.fc2.weight + | 0.000 | -0.022 | 0.030 | 0.011 | torch.Size([120]) || stage5.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.102 | 0.117 | 0.023 | torch.Size([120, 120]) || stage5.linear1.weight + | -0.003 | -0.297 | 0.242 | 0.084 | torch.Size([120]) || stage5.linear1.bias + | 0.999 | 0.971 | 1.008 | 0.005 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.weight + | -0.000 | -0.035 | 0.034 | 0.011 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.bias + | 0.000 | -0.079 | 0.074 | 0.020 | torch.Size([2475, 6]) || 
stage5.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.087 | 0.083 | 0.020 | torch.Size([360, 120]) || stage5.residual_group2.blocks.0.attn.qkv_self.weight + | -0.000 | -0.028 | 0.018 | 0.005 | torch.Size([360]) || stage5.residual_group2.blocks.0.attn.qkv_self.bias + | 0.000 | -0.079 | 0.082 | 0.021 | torch.Size([120, 120]) || stage5.residual_group2.blocks.0.attn.proj.weight + | -0.001 | -0.146 | 0.171 | 0.054 | torch.Size([120]) || stage5.residual_group2.blocks.0.attn.proj.bias + | 0.997 | 0.967 | 1.003 | 0.006 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.weight + | 0.000 | -0.005 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.bias + | -0.000 | -0.073 | 0.089 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc11.weight + | -0.002 | -0.017 | 0.008 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.084 | 0.073 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc12.weight + | 0.000 | -0.013 | 0.011 | 0.003 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.083 | 0.085 | 0.020 | torch.Size([120, 240]) || stage5.residual_group2.blocks.0.mlp.fc2.weight + | 0.000 | -0.103 | 0.140 | 0.037 | torch.Size([120]) || stage5.residual_group2.blocks.0.mlp.fc2.bias + | 0.999 | 0.986 | 1.010 | 0.004 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.weight + | 0.000 | -0.035 | 0.034 | 0.010 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.bias + | 0.000 | -0.087 | 0.074 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.084 | 0.079 | 0.020 | torch.Size([360, 120]) || stage5.residual_group2.blocks.1.attn.qkv_self.weight + | 0.000 | -0.024 | 0.024 | 0.005 | torch.Size([360]) || stage5.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.077 | 0.078 | 0.021 | torch.Size([120, 120]) || stage5.residual_group2.blocks.1.attn.proj.weight + | -0.001 | -0.112 | 0.144 | 0.038 | torch.Size([120]) || stage5.residual_group2.blocks.1.attn.proj.bias + | 0.998 | 0.965 | 1.004 | 0.006 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.weight + | 0.000 | -0.004 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.088 | 0.079 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc11.weight + | -0.001 | -0.012 | 0.015 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.102 | 0.080 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc12.weight + | 0.000 | -0.012 | 0.009 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc12.bias + | 0.000 | -0.075 | 0.078 | 0.020 | torch.Size([120, 240]) || stage5.residual_group2.blocks.1.mlp.fc2.weight + | 0.000 | -0.105 | 0.131 | 0.042 | torch.Size([120]) || stage5.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.220 | 0.209 | 0.035 | torch.Size([120, 120]) || stage5.linear2.weight + | -0.003 | -0.335 | 0.284 | 0.096 | torch.Size([120]) || stage5.linear2.bias + | -0.000 | -0.064 | 0.065 | 0.019 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.weight + | 0.001 | -0.050 | 0.050 | 0.029 | 
torch.Size([120]) || stage5.pa_deform.bias + | 0.000 | -0.119 | 0.106 | 0.013 | torch.Size([120, 242, 3, 3]) || stage5.pa_deform.conv_offset.0.weight + | -0.006 | -0.030 | 0.026 | 0.014 | torch.Size([120]) || stage5.pa_deform.conv_offset.0.bias + | -0.001 | -0.055 | 0.050 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.2.weight + | 0.001 | -0.033 | 0.031 | 0.018 | torch.Size([120]) || stage5.pa_deform.conv_offset.2.bias + | 0.001 | -0.060 | 0.050 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.4.weight + | -0.005 | -0.040 | 0.037 | 0.019 | torch.Size([120]) || stage5.pa_deform.conv_offset.4.bias + | 0.001 | -0.038 | 0.051 | 0.006 | torch.Size([324, 120, 3, 3]) || stage5.pa_deform.conv_offset.6.weight + | 0.000 | -0.048 | 0.050 | 0.017 | torch.Size([324]) || stage5.pa_deform.conv_offset.6.bias + | 0.000 | -0.334 | 0.340 | 0.036 | torch.Size([360, 360]) || stage5.pa_fuse.fc11.weight + | 0.037 | -0.050 | 0.294 | 0.064 | torch.Size([360]) || stage5.pa_fuse.fc11.bias + | -0.000 | -0.343 | 0.349 | 0.036 | torch.Size([360, 360]) || stage5.pa_fuse.fc12.weight + | -0.001 | -0.237 | 0.244 | 0.049 | torch.Size([360]) || stage5.pa_fuse.fc12.bias + | -0.000 | -0.575 | 0.591 | 0.060 | torch.Size([120, 360]) || stage5.pa_fuse.fc2.weight + | -0.001 | -0.404 | 0.344 | 0.122 | torch.Size([120]) || stage5.pa_fuse.fc2.bias + | 1.254 | 1.058 | 1.466 | 0.126 | torch.Size([30]) || stage6.reshape.1.weight + | -0.001 | -0.074 | 0.093 | 0.041 | torch.Size([30]) || stage6.reshape.1.bias + | 0.000 | -0.734 | 0.625 | 0.177 | torch.Size([120, 30]) || stage6.reshape.2.weight + | 0.003 | -0.269 | 0.341 | 0.108 | torch.Size([120]) || stage6.reshape.2.bias + | 0.815 | 0.495 | 1.118 | 0.121 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.weight + | -0.071 | -0.291 | 0.263 | 0.101 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.080 | 0.087 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.136 | 0.134 | 0.026 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_self.weight + | -0.000 | -0.061 | 0.037 | 0.014 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_self.bias + | -0.000 | -0.201 | 0.182 | 0.032 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.attn.proj.weight + | 0.000 | -0.223 | 0.189 | 0.090 | torch.Size([120]) || stage6.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.184 | 0.211 | 0.029 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.000 | -0.049 | 0.069 | 0.011 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.710 | 0.556 | 0.893 | 0.072 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.weight + | -0.003 | -0.172 | 0.193 | 0.070 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.217 | 0.211 | 0.033 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc11.weight + | -0.041 | -0.158 | 0.025 | 0.036 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.209 | 0.178 | 0.031 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc12.weight + | -0.000 | -0.141 | 0.186 | 0.031 | torch.Size([240]) || 
stage6.residual_group1.blocks.0.mlp.fc12.bias + | 0.000 | -0.245 | 0.347 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.mlp.fc2.weight + | 0.005 | -0.161 | 0.188 | 0.079 | torch.Size([120]) || stage6.residual_group1.blocks.0.mlp.fc2.bias + | 0.780 | 0.582 | 0.963 | 0.088 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.weight + | -0.112 | -0.302 | 0.103 | 0.085 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.bias + | 0.000 | -0.101 | 0.072 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.112 | 0.178 | 0.026 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_self.weight + | -0.000 | -0.034 | 0.049 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.223 | 0.242 | 0.033 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.attn.proj.weight + | -0.003 | -0.149 | 0.105 | 0.047 | torch.Size([120]) || stage6.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.199 | 0.173 | 0.031 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.000 | -0.035 | 0.056 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.744 | 0.530 | 0.917 | 0.066 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.weight + | 0.004 | -0.131 | 0.180 | 0.059 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.243 | 0.294 | 0.036 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc11.weight + | -0.039 | -0.217 | 0.045 | 0.037 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.206 | 0.178 | 0.033 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc12.weight + | -0.000 | -0.129 | 0.125 | 0.028 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.236 | 0.276 | 0.040 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.mlp.fc2.weight + | 0.000 | -0.158 | 0.170 | 0.063 | torch.Size([120]) || stage6.residual_group1.blocks.1.mlp.fc2.bias + | 0.829 | 0.586 | 1.007 | 0.078 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.weight + | -0.101 | -0.353 | 0.132 | 0.092 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.bias + | -0.000 | -0.082 | 0.076 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.154 | 0.143 | 0.032 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_self.weight + | 0.000 | -0.041 | 0.038 | 0.012 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.187 | 0.202 | 0.035 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.096 | 0.127 | 0.041 | torch.Size([120]) || stage6.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.203 | 0.185 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.000 | -0.045 | 0.049 | 0.009 | torch.Size([360]) || 
stage6.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.768 | 0.491 | 0.904 | 0.069 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.weight + | 0.001 | -0.146 | 0.159 | 0.062 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.184 | 0.204 | 0.037 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc11.weight + | -0.043 | -0.185 | 0.020 | 0.035 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.188 | 0.270 | 0.035 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc12.weight + | 0.000 | -0.152 | 0.134 | 0.031 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc12.bias + | -0.000 | -0.222 | 0.217 | 0.042 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.mlp.fc2.weight + | 0.002 | -0.141 | 0.144 | 0.058 | torch.Size([120]) || stage6.residual_group1.blocks.2.mlp.fc2.bias + | 0.820 | 0.554 | 0.976 | 0.065 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.weight + | -0.091 | -0.336 | 0.137 | 0.087 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.bias + | 0.000 | -0.124 | 0.222 | 0.023 | torch.Size([675, 6]) || stage6.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.157 | 0.175 | 0.036 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_self.weight + | -0.001 | -0.049 | 0.049 | 0.014 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.238 | 0.236 | 0.036 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.attn.proj.weight + | -0.003 | -0.077 | 0.074 | 0.031 | torch.Size([120]) || stage6.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.212 | 0.265 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.000 | -0.028 | 0.052 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.768 | 0.530 | 0.903 | 0.080 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.weight + | 0.002 | -0.104 | 0.157 | 0.044 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.bias + | -0.000 | -0.197 | 0.220 | 0.039 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc11.weight + | -0.042 | -0.155 | 0.043 | 0.039 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.166 | 0.199 | 0.036 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc12.weight + | 0.001 | -0.102 | 0.138 | 0.040 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.241 | 0.256 | 0.044 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.mlp.fc2.weight + | 0.003 | -0.123 | 0.115 | 0.046 | torch.Size([120]) || stage6.residual_group1.blocks.3.mlp.fc2.bias + | 0.817 | 0.631 | 0.918 | 0.055 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.weight + | -0.082 | -0.295 | 0.141 | 0.074 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.084 | 0.205 | 0.024 | torch.Size([675, 6]) || stage6.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || 
stage6.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.174 | 0.199 | 0.040 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_self.weight + | -0.000 | -0.060 | 0.081 | 0.017 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_self.bias + | -0.000 | -0.194 | 0.191 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.attn.proj.weight + | 0.001 | -0.083 | 0.077 | 0.035 | torch.Size([120]) || stage6.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.218 | 0.243 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.000 | -0.031 | 0.024 | 0.007 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.744 | 0.478 | 0.913 | 0.082 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.weight + | -0.003 | -0.146 | 0.110 | 0.053 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.223 | 0.238 | 0.042 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc11.weight + | -0.046 | -0.200 | 0.071 | 0.051 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.168 | 0.201 | 0.039 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc12.weight + | 0.002 | -0.128 | 0.141 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -0.220 | 0.205 | 0.047 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.mlp.fc2.weight + | 0.001 | -0.086 | 0.094 | 0.034 | torch.Size([120]) || stage6.residual_group1.blocks.4.mlp.fc2.bias + | 0.754 | 0.353 | 0.933 | 0.056 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.weight + | -0.058 | -0.246 | 0.105 | 0.060 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.bias + | -0.000 | -0.113 | 0.536 | 0.030 | torch.Size([675, 6]) || stage6.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.261 | 0.224 | 0.044 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_self.weight + | 0.002 | -0.050 | 0.067 | 0.018 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_self.bias + | 0.000 | -0.234 | 0.256 | 0.038 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.attn.proj.weight + | 0.002 | -0.079 | 0.076 | 0.036 | torch.Size([120]) || stage6.residual_group1.blocks.5.attn.proj.bias + | -0.000 | -0.211 | 0.231 | 0.029 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.000 | -0.033 | 0.030 | 0.008 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.677 | 0.275 | 0.833 | 0.083 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.weight + | 0.001 | -0.224 | 0.306 | 0.102 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.196 | 0.211 | 0.045 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc11.weight + | -0.061 | -0.289 | 0.136 | 0.089 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.271 | 0.312 | 0.048 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc12.weight + | 0.003 | -0.166 | 0.155 | 0.075 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.286 | 0.375 | 0.054 | torch.Size([120, 240]) || 
stage6.residual_group1.blocks.5.mlp.fc2.weight + | 0.005 | -0.054 | 0.137 | 0.031 | torch.Size([120]) || stage6.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.174 | 0.172 | 0.039 | torch.Size([120, 120]) || stage6.linear1.weight + | 0.002 | -0.275 | 0.348 | 0.113 | torch.Size([120]) || stage6.linear1.bias + | 0.704 | 0.402 | 1.002 | 0.132 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.weight + | 0.001 | -0.466 | 0.407 | 0.157 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.bias + | -0.000 | -0.172 | 0.570 | 0.025 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.337 | 0.378 | 0.041 | torch.Size([360, 120]) || stage6.residual_group2.blocks.0.attn.qkv_self.weight + | -0.000 | -0.071 | 0.068 | 0.019 | torch.Size([360]) || stage6.residual_group2.blocks.0.attn.qkv_self.bias + | 0.001 | -0.290 | 0.321 | 0.055 | torch.Size([120, 120]) || stage6.residual_group2.blocks.0.attn.proj.weight + | 0.001 | -0.255 | 0.250 | 0.104 | torch.Size([120]) || stage6.residual_group2.blocks.0.attn.proj.bias + | 0.695 | 0.353 | 0.966 | 0.098 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.weight + | -0.001 | -0.218 | 0.165 | 0.080 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.259 | 0.255 | 0.039 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc11.weight + | -0.044 | -0.256 | 0.042 | 0.047 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.234 | 0.214 | 0.035 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc12.weight + | 0.002 | -0.133 | 0.091 | 0.027 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.333 | 0.296 | 0.042 | torch.Size([120, 240]) || stage6.residual_group2.blocks.0.mlp.fc2.weight + | 0.003 | -0.238 | 0.280 | 0.092 | torch.Size([120]) || stage6.residual_group2.blocks.0.mlp.fc2.bias + | 0.671 | 0.425 | 0.980 | 0.094 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.weight + | 0.001 | -0.261 | 0.305 | 0.119 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.372 | 0.942 | 0.031 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.450 | 0.494 | 0.045 | torch.Size([360, 120]) || stage6.residual_group2.blocks.1.attn.qkv_self.weight + | 0.000 | -0.133 | 0.119 | 0.029 | torch.Size([360]) || stage6.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.239 | 0.288 | 0.046 | torch.Size([120, 120]) || stage6.residual_group2.blocks.1.attn.proj.weight + | -0.001 | -0.187 | 0.157 | 0.064 | torch.Size([120]) || stage6.residual_group2.blocks.1.attn.proj.bias + | 0.687 | 0.160 | 0.907 | 0.128 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.weight + | -0.002 | -0.192 | 0.222 | 0.084 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.257 | 0.426 | 0.042 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc11.weight + | -0.064 | -0.207 | 0.036 | 0.048 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.269 | 0.224 | 0.038 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc12.weight + | -0.000 | -0.126 | 0.129 | 0.030 | 
torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.308 | 0.298 | 0.041 | torch.Size([120, 240]) || stage6.residual_group2.blocks.1.mlp.fc2.weight + | -0.004 | -0.180 | 0.192 | 0.061 | torch.Size([120]) || stage6.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.297 | 0.368 | 0.069 | torch.Size([120, 120]) || stage6.linear2.weight + | 0.001 | -0.431 | 0.480 | 0.189 | torch.Size([120]) || stage6.linear2.bias + | 0.000 | -0.100 | 0.104 | 0.023 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.weight + | 0.001 | -0.018 | 0.029 | 0.010 | torch.Size([120]) || stage6.pa_deform.bias + | 0.000 | -0.105 | 0.111 | 0.015 | torch.Size([120, 242, 3, 3]) || stage6.pa_deform.conv_offset.0.weight + | -0.007 | -0.033 | 0.024 | 0.014 | torch.Size([120]) || stage6.pa_deform.conv_offset.0.bias + | -0.001 | -0.071 | 0.067 | 0.019 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.2.weight + | -0.003 | -0.061 | 0.043 | 0.022 | torch.Size([120]) || stage6.pa_deform.conv_offset.2.bias + | -0.000 | -0.074 | 0.068 | 0.019 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.4.weight + | 0.001 | -0.075 | 0.056 | 0.030 | torch.Size([120]) || stage6.pa_deform.conv_offset.4.bias + | 0.001 | -0.124 | 0.108 | 0.013 | torch.Size([324, 120, 3, 3]) || stage6.pa_deform.conv_offset.6.weight + | -0.001 | -0.113 | 0.076 | 0.021 | torch.Size([324]) || stage6.pa_deform.conv_offset.6.bias + | -0.001 | -0.517 | 0.524 | 0.101 | torch.Size([360, 360]) || stage6.pa_fuse.fc11.weight + | 0.154 | -0.305 | 0.679 | 0.180 | torch.Size([360]) || stage6.pa_fuse.fc11.bias + | 0.000 | -0.680 | 0.728 | 0.103 | torch.Size([360, 360]) || stage6.pa_fuse.fc12.weight + | 0.020 | -0.514 | 0.417 | 0.199 | torch.Size([360]) || stage6.pa_fuse.fc12.bias + | -0.000 | -0.587 | 0.737 | 0.135 | torch.Size([120, 360]) || stage6.pa_fuse.fc2.weight + | 0.015 | -0.437 | 0.490 | 0.230 | torch.Size([120]) || stage6.pa_fuse.fc2.bias + | 1.284 | 1.119 | 1.404 | 0.055 | torch.Size([30]) || stage7.reshape.1.weight + | -0.014 | -0.286 | 0.184 | 0.122 | torch.Size([30]) || stage7.reshape.1.bias + | -0.000 | -0.521 | 0.576 | 0.154 | torch.Size([120, 30]) || stage7.reshape.2.weight + | 0.004 | -0.387 | 0.738 | 0.175 | torch.Size([120]) || stage7.reshape.2.bias + | 0.440 | 0.099 | 0.775 | 0.141 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.weight + | -0.177 | -0.670 | 0.319 | 0.183 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.bias + | -0.055 | -2.159 | 1.979 | 0.240 | torch.Size([675, 6]) || stage7.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.535 | 0.554 | 0.104 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_self.weight + | 0.003 | -0.193 | 0.281 | 0.053 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_self.bias + | -0.001 | -0.397 | 0.395 | 0.075 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.attn.proj.weight + | -0.001 | -0.232 | 0.692 | 0.106 | torch.Size([120]) || stage7.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.899 | 1.073 | 0.091 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.122 | 0.104 | 0.017 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.310 | 0.157 | 0.440 | 0.055 
| torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.weight + | 0.006 | -0.474 | 0.266 | 0.105 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.605 | 0.490 | 0.115 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc11.weight + | -0.101 | -0.310 | 0.126 | 0.070 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.448 | 0.475 | 0.116 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc12.weight + | 0.006 | -0.185 | 0.215 | 0.071 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc12.bias + | 0.001 | -0.465 | 0.512 | 0.122 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.mlp.fc2.weight + | 0.000 | -0.150 | 0.417 | 0.077 | torch.Size([120]) || stage7.residual_group1.blocks.0.mlp.fc2.bias + | 0.577 | 0.165 | 0.829 | 0.105 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.weight + | -0.136 | -0.849 | 0.206 | 0.141 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.bias + | -0.143 | -3.020 | 4.621 | 0.357 | torch.Size([675, 6]) || stage7.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.647 | 0.640 | 0.123 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_self.weight + | -0.002 | -0.356 | 0.382 | 0.064 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.457 | 0.378 | 0.081 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.attn.proj.weight + | 0.000 | -0.250 | 0.707 | 0.108 | torch.Size([120]) || stage7.residual_group1.blocks.1.attn.proj.bias + | -0.001 | -1.055 | 1.091 | 0.096 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.001 | -0.093 | 0.123 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.411 | 0.265 | 0.535 | 0.044 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.weight + | 0.008 | -0.630 | 0.264 | 0.121 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.501 | 0.506 | 0.119 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc11.weight + | -0.087 | -0.341 | 0.140 | 0.073 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.450 | 0.527 | 0.119 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc12.weight + | 0.005 | -0.188 | 0.171 | 0.063 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.554 | 0.546 | 0.121 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.mlp.fc2.weight + | -0.000 | -0.135 | 0.220 | 0.061 | torch.Size([120]) || stage7.residual_group1.blocks.1.mlp.fc2.bias + | 0.655 | 0.134 | 0.896 | 0.130 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.weight + | -0.139 | -0.788 | 0.181 | 0.115 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.bias + | -0.062 | -3.469 | 3.276 | 0.272 | torch.Size([675, 6]) || stage7.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.2.attn.position_bias + | -0.000 | -0.592 | 0.650 | 0.124 | torch.Size([360, 120]) || 
stage7.residual_group1.blocks.2.attn.qkv_self.weight + | -0.000 | -0.308 | 0.218 | 0.062 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.355 | 0.345 | 0.082 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.213 | 0.700 | 0.097 | torch.Size([120]) || stage7.residual_group1.blocks.2.attn.proj.bias + | -0.001 | -1.166 | 0.942 | 0.107 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.000 | -0.106 | 0.093 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.466 | 0.317 | 0.565 | 0.042 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.weight + | 0.014 | -0.657 | 0.280 | 0.118 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.541 | 0.494 | 0.118 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc11.weight + | -0.079 | -0.335 | 0.122 | 0.080 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.513 | 0.493 | 0.123 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc12.weight + | -0.007 | -0.180 | 0.175 | 0.066 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc12.bias + | -0.001 | -0.509 | 0.479 | 0.123 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.mlp.fc2.weight + | 0.004 | -0.093 | 0.293 | 0.054 | torch.Size([120]) || stage7.residual_group1.blocks.2.mlp.fc2.bias + | 0.693 | 0.147 | 0.945 | 0.133 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.weight + | -0.132 | -0.906 | 0.249 | 0.113 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.bias + | -0.108 | -3.576 | 4.241 | 0.344 | torch.Size([675, 6]) || stage7.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.945 | 1.095 | 0.129 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_self.weight + | 0.003 | -0.274 | 0.204 | 0.061 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_self.bias + | -0.001 | -0.379 | 0.351 | 0.081 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.attn.proj.weight + | 0.000 | -0.211 | 0.587 | 0.095 | torch.Size([120]) || stage7.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -1.269 | 1.067 | 0.102 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.001 | -0.091 | 0.117 | 0.021 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.499 | 0.285 | 0.570 | 0.040 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.weight + | 0.012 | -0.567 | 0.273 | 0.104 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.bias + | 0.001 | -0.528 | 0.499 | 0.118 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc11.weight + | -0.084 | -0.349 | 0.141 | 0.078 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.547 | 0.592 | 0.126 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc12.weight + | 0.002 | -0.154 | 0.176 | 0.068 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc12.bias + | 0.001 | -0.520 | 0.480 | 0.125 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.mlp.fc2.weight + | 0.001 | -0.150 | 0.207 | 0.065 | torch.Size([120]) || stage7.residual_group1.blocks.3.mlp.fc2.bias + | 
0.726 | 0.137 | 1.004 | 0.160 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.weight + | -0.122 | -0.907 | 0.180 | 0.103 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.bias + | -0.078 | -3.824 | 4.241 | 0.297 | torch.Size([675, 6]) || stage7.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.4.attn.position_bias + | -0.000 | -1.188 | 0.796 | 0.127 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_self.weight + | 0.002 | -0.248 | 0.207 | 0.056 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_self.bias + | -0.001 | -0.409 | 0.369 | 0.085 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.attn.proj.weight + | 0.002 | -0.224 | 0.322 | 0.094 | torch.Size([120]) || stage7.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -1.744 | 1.273 | 0.110 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.001 | -0.092 | 0.113 | 0.019 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.514 | 0.277 | 0.614 | 0.041 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.weight + | 0.016 | -0.621 | 0.286 | 0.095 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.bias + | 0.001 | -0.517 | 0.453 | 0.116 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc11.weight + | -0.064 | -0.260 | 0.143 | 0.083 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.503 | 0.554 | 0.129 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc12.weight + | -0.004 | -0.232 | 0.193 | 0.075 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc12.bias + | -0.001 | -0.595 | 0.543 | 0.128 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.mlp.fc2.weight + | 0.001 | -0.196 | 0.198 | 0.071 | torch.Size([120]) || stage7.residual_group1.blocks.4.mlp.fc2.bias + | 0.731 | 0.152 | 1.075 | 0.114 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.weight + | -0.076 | -1.003 | 0.176 | 0.107 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.bias + | -0.121 | -3.281 | 4.671 | 0.296 | torch.Size([675, 6]) || stage7.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.640 | 1.083 | 0.122 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_self.weight + | -0.001 | -0.239 | 0.314 | 0.068 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_self.bias + | 0.001 | -0.344 | 0.452 | 0.078 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.attn.proj.weight + | 0.004 | -0.361 | 0.251 | 0.093 | torch.Size([120]) || stage7.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.637 | 0.806 | 0.093 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.000 | -0.088 | 0.091 | 0.017 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.514 | 0.238 | 0.594 | 0.042 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.weight + | 0.017 | -0.650 | 0.162 | 0.089 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.bias + | 0.000 | 
-0.442 | 0.479 | 0.114 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc11.weight + | -0.040 | -0.400 | 0.203 | 0.101 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.541 | 0.514 | 0.130 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc12.weight + | -0.008 | -0.319 | 0.309 | 0.092 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -1.018 | 1.398 | 0.130 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.mlp.fc2.weight + | 0.001 | -1.606 | 0.269 | 0.179 | torch.Size([120]) || stage7.residual_group1.blocks.5.mlp.fc2.bias + | 0.000 | -0.186 | 0.207 | 0.048 | torch.Size([120, 120]) || stage7.linear1.weight + | 0.010 | -0.448 | 0.437 | 0.161 | torch.Size([120]) || stage7.linear1.bias + | 0.703 | 0.381 | 0.856 | 0.084 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.weight + | 0.014 | -0.645 | 0.486 | 0.169 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.bias + | -0.007 | -4.468 | 1.008 | 0.164 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage7.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.625 | 0.834 | 0.120 | torch.Size([360, 120]) || stage7.residual_group2.blocks.0.attn.qkv_self.weight + | -0.009 | -0.737 | 0.632 | 0.135 | torch.Size([360]) || stage7.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.403 | 0.406 | 0.088 | torch.Size([120, 120]) || stage7.residual_group2.blocks.0.attn.proj.weight + | -0.007 | -0.338 | 0.165 | 0.070 | torch.Size([120]) || stage7.residual_group2.blocks.0.attn.proj.bias + | 0.435 | 0.323 | 0.526 | 0.038 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.weight + | 0.005 | -0.678 | 0.379 | 0.117 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.465 | 0.467 | 0.110 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc11.weight + | -0.031 | -0.236 | 0.180 | 0.077 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.490 | 0.520 | 0.121 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc12.weight + | -0.003 | -0.197 | 0.242 | 0.069 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc12.bias + | -0.000 | -0.525 | 0.501 | 0.122 | torch.Size([120, 240]) || stage7.residual_group2.blocks.0.mlp.fc2.weight + | -0.005 | -0.431 | 0.164 | 0.077 | torch.Size([120]) || stage7.residual_group2.blocks.0.mlp.fc2.bias + | 0.703 | 0.306 | 0.866 | 0.079 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.weight + | 0.009 | -0.647 | 0.481 | 0.149 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.bias + | -0.010 | -3.504 | 1.842 | 0.134 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage7.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.639 | 0.590 | 0.122 | torch.Size([360, 120]) || stage7.residual_group2.blocks.1.attn.qkv_self.weight + | -0.001 | -0.613 | 0.609 | 0.148 | torch.Size([360]) || stage7.residual_group2.blocks.1.attn.qkv_self.bias + | 0.001 | -0.316 | 0.325 | 0.085 | torch.Size([120, 120]) || stage7.residual_group2.blocks.1.attn.proj.weight + | -0.004 | -0.350 | 0.145 | 0.069 | torch.Size([120]) || stage7.residual_group2.blocks.1.attn.proj.bias + | 0.452 | 0.309 | 0.558 | 0.037 | torch.Size([120]) || 
stage7.residual_group2.blocks.1.norm2.weight + | 0.003 | -0.661 | 0.246 | 0.091 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.580 | 0.410 | 0.108 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc11.weight + | -0.020 | -0.258 | 0.299 | 0.104 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.561 | 0.126 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc12.weight + | -0.002 | -0.234 | 0.434 | 0.090 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.778 | 0.581 | 0.124 | torch.Size([120, 240]) || stage7.residual_group2.blocks.1.mlp.fc2.weight + | -0.001 | -0.888 | 0.286 | 0.135 | torch.Size([120]) || stage7.residual_group2.blocks.1.mlp.fc2.bias + | -0.001 | -0.348 | 0.237 | 0.060 | torch.Size([120, 120]) || stage7.linear2.weight + | 0.023 | -0.390 | 0.506 | 0.167 | torch.Size([120]) || stage7.linear2.bias + | -0.000 | -0.104 | 0.107 | 0.024 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.weight + | 0.002 | -0.041 | 0.035 | 0.016 | torch.Size([120]) || stage7.pa_deform.bias + | -0.000 | -0.123 | 0.109 | 0.017 | torch.Size([120, 242, 3, 3]) || stage7.pa_deform.conv_offset.0.weight + | -0.002 | -0.034 | 0.032 | 0.015 | torch.Size([120]) || stage7.pa_deform.conv_offset.0.bias + | -0.001 | -0.111 | 0.084 | 0.019 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.2.weight + | -0.008 | -0.073 | 0.081 | 0.034 | torch.Size([120]) || stage7.pa_deform.conv_offset.2.bias + | -0.002 | -0.154 | 0.122 | 0.018 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.4.weight + | 0.014 | -0.041 | 0.068 | 0.026 | torch.Size([120]) || stage7.pa_deform.conv_offset.4.bias + | -0.001 | -0.408 | 0.365 | 0.034 | torch.Size([324, 120, 3, 3]) || stage7.pa_deform.conv_offset.6.weight + | -0.003 | -0.057 | 0.054 | 0.024 | torch.Size([324]) || stage7.pa_deform.conv_offset.6.bias + | 0.000 | -0.697 | 0.606 | 0.123 | torch.Size([360, 360]) || stage7.pa_fuse.fc11.weight + | 0.119 | -0.211 | 0.720 | 0.177 | torch.Size([360]) || stage7.pa_fuse.fc11.bias + | 0.000 | -1.175 | 0.924 | 0.154 | torch.Size([360, 360]) || stage7.pa_fuse.fc12.weight + | -0.000 | -0.581 | 0.580 | 0.190 | torch.Size([360]) || stage7.pa_fuse.fc12.bias + | 0.001 | -0.786 | 0.874 | 0.135 | torch.Size([120, 360]) || stage7.pa_fuse.fc2.weight + | -0.053 | -0.522 | 0.577 | 0.205 | torch.Size([120]) || stage7.pa_fuse.fc2.bias + | 1.225 | 1.000 | 1.516 | 0.095 | torch.Size([120]) || stage8.0.1.weight + | -0.013 | -0.413 | 0.465 | 0.139 | torch.Size([120]) || stage8.0.1.bias + | 0.000 | -2.505 | 0.627 | 0.136 | torch.Size([180, 120]) || stage8.0.2.weight + | 0.005 | -0.397 | 0.377 | 0.107 | torch.Size([180]) || stage8.0.2.bias + | 0.456 | 0.123 | 0.760 | 0.129 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.weight + | -0.022 | -0.343 | 0.875 | 0.099 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.bias + | -0.014 | -1.907 | 2.592 | 0.130 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.632 | 0.628 | 0.099 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.0.attn.qkv_self.weight + | 0.006 | -0.567 | 0.668 | 0.148 | torch.Size([540]) || stage8.1.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.477 | 0.447 | 0.094 | torch.Size([180, 180]) || 
stage8.1.residual_group.blocks.0.attn.proj.weight + | -0.010 | -0.460 | 0.225 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.0.attn.proj.bias + | 0.429 | 0.119 | 0.634 | 0.090 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.weight + | -0.007 | -0.338 | 0.803 | 0.086 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.bias + | -0.006 | -0.572 | 0.539 | 0.119 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc11.weight + | -0.060 | -0.260 | 0.185 | 0.060 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.461 | 0.548 | 0.113 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc12.weight + | 0.000 | -0.163 | 0.183 | 0.050 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.757 | 0.581 | 0.118 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.0.mlp.fc2.weight + | -0.003 | -0.191 | 0.121 | 0.057 | torch.Size([180]) || stage8.1.residual_group.blocks.0.mlp.fc2.bias + | 0.557 | 0.086 | 0.800 | 0.112 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.weight + | -0.029 | -0.230 | 0.878 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.bias + | -0.016 | -2.004 | 1.711 | 0.154 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.690 | 0.575 | 0.109 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.1.attn.qkv_self.weight + | 0.011 | -0.641 | 0.609 | 0.135 | torch.Size([540]) || stage8.1.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.466 | 0.401 | 0.094 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.1.attn.proj.weight + | -0.008 | -0.344 | 0.181 | 0.080 | torch.Size([180]) || stage8.1.residual_group.blocks.1.attn.proj.bias + | 0.503 | 0.226 | 0.742 | 0.093 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.weight + | -0.009 | -0.404 | 0.818 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.bias + | -0.007 | -0.595 | 0.532 | 0.121 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc11.weight + | -0.068 | -0.261 | 0.071 | 0.053 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.573 | 0.116 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc12.weight + | 0.002 | -0.129 | 0.197 | 0.046 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.556 | 0.582 | 0.118 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.1.mlp.fc2.weight + | -0.003 | -0.170 | 0.145 | 0.052 | torch.Size([180]) || stage8.1.residual_group.blocks.1.mlp.fc2.bias + | 0.699 | 0.202 | 0.912 | 0.109 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.weight + | -0.033 | -0.253 | 0.924 | 0.091 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.bias + | -0.030 | -2.510 | 2.088 | 0.194 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.637 | 0.801 | 0.116 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.2.attn.qkv_self.weight + | 0.006 | -0.512 | 0.520 | 0.110 | torch.Size([540]) || stage8.1.residual_group.blocks.2.attn.qkv_self.bias + | 0.000 | -0.381 | 0.337 | 0.090 | torch.Size([180, 180]) || 
stage8.1.residual_group.blocks.2.attn.proj.weight + | -0.011 | -0.238 | 0.234 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.2.attn.proj.bias + | 0.594 | 0.150 | 0.810 | 0.108 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.weight + | -0.010 | -0.483 | 0.726 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.bias + | -0.006 | -0.567 | 0.499 | 0.125 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc11.weight + | -0.077 | -0.360 | 0.050 | 0.056 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.536 | 0.673 | 0.119 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc12.weight + | 0.001 | -0.142 | 0.186 | 0.043 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.536 | 0.524 | 0.119 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.2.mlp.fc2.weight + | -0.006 | -0.147 | 0.133 | 0.051 | torch.Size([180]) || stage8.1.residual_group.blocks.2.mlp.fc2.bias + | 0.683 | 0.141 | 0.908 | 0.105 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.weight + | -0.033 | -0.199 | 0.878 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.bias + | -0.039 | -1.527 | 3.891 | 0.199 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.682 | 0.693 | 0.120 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.3.attn.qkv_self.weight + | 0.007 | -0.543 | 0.513 | 0.138 | torch.Size([540]) || stage8.1.residual_group.blocks.3.attn.qkv_self.bias + | -0.001 | -0.390 | 0.476 | 0.089 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.3.attn.proj.weight + | -0.007 | -0.176 | 0.150 | 0.062 | torch.Size([180]) || stage8.1.residual_group.blocks.3.attn.proj.bias + | 0.640 | 0.094 | 0.853 | 0.120 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.weight + | -0.009 | -0.372 | 0.683 | 0.084 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.bias + | -0.006 | -0.628 | 0.521 | 0.126 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc11.weight + | -0.089 | -0.367 | 0.047 | 0.054 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.629 | 0.562 | 0.121 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc12.weight + | -0.001 | -0.186 | 0.128 | 0.042 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.485 | 0.499 | 0.118 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.3.mlp.fc2.weight + | -0.007 | -0.138 | 0.209 | 0.050 | torch.Size([180]) || stage8.1.residual_group.blocks.3.mlp.fc2.bias + | 0.000 | -0.294 | 0.577 | 0.071 | torch.Size([180, 180]) || stage8.1.linear.weight + | 0.004 | -0.349 | 0.235 | 0.072 | torch.Size([180]) || stage8.1.linear.bias + | 0.708 | 0.242 | 1.026 | 0.136 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.weight + | -0.032 | -0.212 | 0.830 | 0.100 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.bias + | -0.039 | -1.954 | 2.394 | 0.212 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.922 | 0.646 | 0.116 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | 
-0.429 | 0.524 | 0.101 | torch.Size([540]) || stage8.2.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.467 | 0.453 | 0.109 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.0.attn.proj.weight + | -0.005 | -0.339 | 0.264 | 0.095 | torch.Size([180]) || stage8.2.residual_group.blocks.0.attn.proj.bias + | 0.587 | 0.255 | 0.837 | 0.086 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.weight + | -0.011 | -0.285 | 0.721 | 0.083 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.bias + | -0.006 | -0.586 | 0.534 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc11.weight + | -0.075 | -0.225 | 0.066 | 0.047 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.493 | 0.532 | 0.123 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc12.weight + | 0.003 | -0.189 | 0.178 | 0.047 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.551 | 0.543 | 0.124 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.0.mlp.fc2.weight + | -0.010 | -0.154 | 0.142 | 0.054 | torch.Size([180]) || stage8.2.residual_group.blocks.0.mlp.fc2.bias + | 0.773 | 0.210 | 1.004 | 0.113 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.weight + | -0.035 | -0.176 | 0.873 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.bias + | -0.027 | -2.407 | 1.736 | 0.214 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.817 | 0.977 | 0.123 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.1.attn.qkv_self.weight + | 0.001 | -0.659 | 0.461 | 0.115 | torch.Size([540]) || stage8.2.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.484 | 0.453 | 0.109 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.1.attn.proj.weight + | -0.014 | -0.315 | 0.252 | 0.091 | torch.Size([180]) || stage8.2.residual_group.blocks.1.attn.proj.bias + | 0.641 | 0.337 | 0.810 | 0.081 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.weight + | -0.011 | -0.177 | 0.806 | 0.083 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.bias + | -0.006 | -0.569 | 0.598 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc11.weight + | -0.079 | -0.323 | 0.071 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.512 | 0.577 | 0.126 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc12.weight + | -0.003 | -0.142 | 0.161 | 0.050 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.529 | 0.572 | 0.125 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.1.mlp.fc2.weight + | -0.010 | -0.178 | 0.159 | 0.066 | torch.Size([180]) || stage8.2.residual_group.blocks.1.mlp.fc2.bias + | 0.857 | 0.199 | 1.153 | 0.112 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.weight + | -0.039 | -0.189 | 0.943 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.bias + | -0.042 | -1.962 | 2.773 | 0.246 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.783 | 0.655 | 0.123 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.2.attn.qkv_self.weight + | 0.004 | 
-0.338 | 0.533 | 0.099 | torch.Size([540]) || stage8.2.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.497 | 0.461 | 0.107 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.2.attn.proj.weight + | -0.008 | -0.288 | 0.183 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.2.attn.proj.bias + | 0.681 | 0.327 | 0.878 | 0.085 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.weight + | -0.012 | -0.178 | 0.773 | 0.084 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.bias + | -0.006 | -0.789 | 0.546 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc11.weight + | -0.081 | -0.249 | 0.036 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.526 | 0.555 | 0.128 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc12.weight + | 0.000 | -0.133 | 0.191 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.572 | 0.529 | 0.126 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.164 | 0.147 | 0.065 | torch.Size([180]) || stage8.2.residual_group.blocks.2.mlp.fc2.bias + | 0.877 | 0.198 | 1.043 | 0.094 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.weight + | -0.038 | -0.210 | 0.916 | 0.091 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.bias + | -0.094 | -2.974 | 4.987 | 0.299 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.964 | 1.011 | 0.126 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.3.attn.qkv_self.weight + | -0.002 | -0.404 | 0.429 | 0.101 | torch.Size([540]) || stage8.2.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.501 | 0.489 | 0.110 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.3.attn.proj.weight + | -0.021 | -0.305 | 0.208 | 0.097 | torch.Size([180]) || stage8.2.residual_group.blocks.3.attn.proj.bias + | 0.697 | 0.295 | 0.894 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.weight + | -0.015 | -0.241 | 0.712 | 0.086 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.bias + | -0.005 | -0.562 | 0.573 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc11.weight + | -0.085 | -0.302 | 0.080 | 0.060 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.734 | 0.573 | 0.130 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc12.weight + | 0.001 | -0.150 | 0.161 | 0.054 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.671 | 0.623 | 0.127 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.3.mlp.fc2.weight + | -0.023 | -0.252 | 0.317 | 0.081 | torch.Size([180]) || stage8.2.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.278 | 0.345 | 0.064 | torch.Size([180, 180]) || stage8.2.linear.weight + | 0.004 | -0.315 | 0.148 | 0.064 | torch.Size([180]) || stage8.2.linear.bias + | 0.850 | 0.326 | 1.087 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.weight + | -0.031 | -0.334 | 0.779 | 0.106 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.bias + | -0.012 | -2.917 | 1.476 | 0.175 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || 
stage8.3.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.603 | 0.666 | 0.124 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.374 | 0.381 | 0.086 | torch.Size([540]) || stage8.3.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.577 | 0.605 | 0.119 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.0.attn.proj.weight + | -0.008 | -0.394 | 0.499 | 0.134 | torch.Size([180]) || stage8.3.residual_group.blocks.0.attn.proj.bias + | 0.636 | 0.321 | 0.790 | 0.073 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.weight + | -0.013 | -0.294 | 0.774 | 0.090 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.bias + | -0.004 | -0.540 | 0.539 | 0.123 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc11.weight + | -0.065 | -0.212 | 0.047 | 0.051 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.608 | 0.603 | 0.130 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc12.weight + | -0.002 | -0.177 | 0.155 | 0.051 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.573 | 0.630 | 0.129 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.0.mlp.fc2.weight + | -0.005 | -0.189 | 0.178 | 0.071 | torch.Size([180]) || stage8.3.residual_group.blocks.0.mlp.fc2.bias + | 0.899 | 0.275 | 1.048 | 0.099 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.weight + | -0.031 | -0.223 | 0.771 | 0.088 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.bias + | -0.003 | -3.151 | 1.718 | 0.202 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.1.attn.relative_position_index + | -0.000 | -0.732 | 0.868 | 0.127 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.1.attn.qkv_self.weight + | 0.002 | -0.412 | 0.350 | 0.093 | torch.Size([540]) || stage8.3.residual_group.blocks.1.attn.qkv_self.bias + | 0.001 | -0.466 | 0.487 | 0.114 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.1.attn.proj.weight + | -0.006 | -0.388 | 0.400 | 0.129 | torch.Size([180]) || stage8.3.residual_group.blocks.1.attn.proj.bias + | 0.711 | 0.381 | 0.864 | 0.082 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.weight + | -0.009 | -0.240 | 0.692 | 0.090 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.bias + | -0.005 | -0.657 | 0.639 | 0.126 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc11.weight + | -0.077 | -0.263 | 0.047 | 0.057 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.673 | 0.605 | 0.134 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc12.weight + | 0.002 | -0.158 | 0.155 | 0.046 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc12.bias + | -0.000 | -0.582 | 0.585 | 0.131 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.1.mlp.fc2.weight + | -0.009 | -0.253 | 0.178 | 0.070 | torch.Size([180]) || stage8.3.residual_group.blocks.1.mlp.fc2.bias + | 0.941 | 0.262 | 1.154 | 0.094 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.weight + | -0.032 | -0.162 | 0.906 | 0.084 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.bias + | -0.005 | -3.421 | 1.350 | 0.205 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || 
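
The recurring relative_position_bias_table of shape [2475, 6] and relative_position_index of shape [384, 384] follow from Swin-style relative position bias over the (6, 8, 8) attention window with 6 heads: (2*6-1)*(2*8-1)*(2*8-1) = 2475 table rows and 6*8*8 = 384 tokens per window, which also explains the index statistics (min 0, max 2474, mean exactly 1237). From stage8.5 onward the tables shrink to [225, 6] with 64x64 indices, consistent with a spatial-only (1, 8, 8) window: 1*15*15 = 225 and 8*8 = 64. A sketch of the standard 3-D index construction, assuming VRT follows the usual Swin recipe:

    import torch

    def relative_position_index_3d(d, h, w):
        # Swin-style relative-position index for a (d, h, w) window
        # (torch >= 1.10 for the indexing= keyword).
        coords = torch.stack(torch.meshgrid(
            torch.arange(d), torch.arange(h), torch.arange(w), indexing='ij'))
        coords = torch.flatten(coords, 1)               # 3, d*h*w
        rel = coords[:, :, None] - coords[:, None, :]   # 3, N, N
        rel = rel.permute(1, 2, 0).contiguous()         # N, N, 3
        rel[:, :, 0] += d - 1                           # shift each axis to >= 0
        rel[:, :, 1] += h - 1
        rel[:, :, 2] += w - 1
        rel[:, :, 0] *= (2 * h - 1) * (2 * w - 1)
        rel[:, :, 1] *= 2 * w - 1
        return rel.sum(-1)                              # N, N

    idx = relative_position_index_3d(6, 8, 8)           # 384 x 384
    assert idx.max().item() == 2474
    assert idx.double().mean().item() == 1237.0         # matches the log
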
stage8.3.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.777 | 0.735 | 0.130 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.2.attn.qkv_self.weight + | 0.000 | -0.355 | 0.421 | 0.092 | torch.Size([540]) || stage8.3.residual_group.blocks.2.attn.qkv_self.bias + | 0.000 | -0.479 | 0.475 | 0.115 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.2.attn.proj.weight + | -0.013 | -0.292 | 0.345 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.2.attn.proj.bias + | 0.743 | 0.242 | 0.919 | 0.093 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.weight + | -0.011 | -0.214 | 0.691 | 0.094 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.bias + | -0.005 | -0.633 | 0.498 | 0.127 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc11.weight + | -0.082 | -0.346 | 0.087 | 0.062 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.591 | 0.670 | 0.134 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc12.weight + | 0.001 | -0.190 | 0.151 | 0.056 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.560 | 0.637 | 0.132 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.2.mlp.fc2.weight + | -0.009 | -0.226 | 0.250 | 0.085 | torch.Size([180]) || stage8.3.residual_group.blocks.2.mlp.fc2.bias + | 0.950 | 0.250 | 1.103 | 0.086 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.weight + | -0.035 | -0.196 | 0.925 | 0.088 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.bias + | -0.026 | -3.591 | 5.653 | 0.236 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.753 | 0.637 | 0.128 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.3.attn.qkv_self.weight + | 0.000 | -0.333 | 0.432 | 0.081 | torch.Size([540]) || stage8.3.residual_group.blocks.3.attn.qkv_self.bias + | 0.001 | -0.591 | 0.591 | 0.118 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.3.attn.proj.weight + | -0.014 | -0.348 | 0.267 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.3.attn.proj.bias + | 0.735 | 0.254 | 0.893 | 0.082 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.weight + | -0.011 | -0.241 | 0.659 | 0.093 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.bias + | -0.005 | -0.628 | 0.667 | 0.125 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc11.weight + | -0.076 | -0.411 | 0.113 | 0.072 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.662 | 0.578 | 0.135 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc12.weight + | -0.004 | -0.208 | 0.169 | 0.054 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.602 | 0.588 | 0.131 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.3.mlp.fc2.weight + | -0.011 | -0.218 | 0.232 | 0.096 | torch.Size([180]) || stage8.3.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.343 | 0.316 | 0.065 | torch.Size([180, 180]) || stage8.3.linear.weight + | 0.010 | -0.297 | 0.187 | 0.061 | torch.Size([180]) || stage8.3.linear.bias + | 1.012 | 0.330 | 1.282 | 0.149 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.weight + | -0.030 | -0.347 | 0.800 | 0.134 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.bias + | -0.013 | -2.816 | 3.792 | 0.236 | 
torch.Size([2475, 6]) || stage8.4.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.807 | 0.825 | 0.131 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.0.attn.qkv_self.weight + | -0.003 | -0.429 | 0.319 | 0.083 | torch.Size([540]) || stage8.4.residual_group.blocks.0.attn.qkv_self.bias + | 0.001 | -0.553 | 0.569 | 0.136 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.0.attn.proj.weight + | -0.019 | -0.443 | 0.441 | 0.139 | torch.Size([180]) || stage8.4.residual_group.blocks.0.attn.proj.bias + | 0.638 | 0.420 | 0.797 | 0.063 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.weight + | -0.018 | -0.222 | 0.886 | 0.107 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.bias + | -0.002 | -0.576 | 0.510 | 0.117 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc11.weight + | -0.018 | -0.277 | 0.123 | 0.068 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.687 | 0.625 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc12.weight + | -0.007 | -0.264 | 0.267 | 0.076 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc12.bias + | 0.001 | -0.639 | 0.705 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.0.mlp.fc2.weight + | -0.012 | -0.255 | 0.274 | 0.095 | torch.Size([180]) || stage8.4.residual_group.blocks.0.mlp.fc2.bias + | 1.092 | 0.475 | 1.341 | 0.115 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.weight + | -0.030 | -0.294 | 0.686 | 0.113 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.bias + | 0.018 | -3.165 | 0.990 | 0.213 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.695 | 0.699 | 0.133 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.1.attn.qkv_self.weight + | 0.002 | -0.319 | 0.286 | 0.075 | torch.Size([540]) || stage8.4.residual_group.blocks.1.attn.qkv_self.bias + | -0.001 | -0.542 | 0.519 | 0.133 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.1.attn.proj.weight + | -0.017 | -0.439 | 0.451 | 0.152 | torch.Size([180]) || stage8.4.residual_group.blocks.1.attn.proj.bias + | 0.664 | 0.366 | 0.835 | 0.074 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.weight + | -0.015 | -0.217 | 0.985 | 0.103 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.bias + | -0.002 | -0.641 | 0.563 | 0.117 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc11.weight + | -0.022 | -0.381 | 0.161 | 0.078 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.571 | 0.642 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc12.weight + | 0.003 | -0.279 | 0.311 | 0.087 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.738 | 0.633 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.1.mlp.fc2.weight + | -0.007 | -0.254 | 0.261 | 0.084 | torch.Size([180]) || stage8.4.residual_group.blocks.1.mlp.fc2.bias + | 1.125 | 0.525 | 1.405 | 0.117 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.weight + | -0.033 | -0.186 | 0.627 | 0.082 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.bias + | 0.028 | -3.477 | 0.957 | 0.217 | 
torch.Size([2475, 6]) || stage8.4.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.663 | 0.658 | 0.130 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.2.attn.qkv_self.weight + | -0.007 | -0.357 | 0.255 | 0.064 | torch.Size([540]) || stage8.4.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.596 | 0.578 | 0.137 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.2.attn.proj.weight + | -0.018 | -0.506 | 0.389 | 0.159 | torch.Size([180]) || stage8.4.residual_group.blocks.2.attn.proj.bias + | 0.694 | 0.319 | 0.865 | 0.084 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.weight + | -0.018 | -0.150 | 0.975 | 0.087 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.bias + | -0.002 | -0.619 | 0.565 | 0.116 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc11.weight + | -0.025 | -0.345 | 0.208 | 0.086 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.624 | 0.607 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc12.weight + | -0.003 | -0.388 | 0.290 | 0.075 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.927 | 0.675 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.325 | 0.240 | 0.096 | torch.Size([180]) || stage8.4.residual_group.blocks.2.mlp.fc2.bias + | 1.108 | 0.535 | 1.297 | 0.094 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.weight + | -0.035 | -0.213 | 0.546 | 0.064 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.bias + | 0.020 | -3.042 | 1.420 | 0.192 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.697 | 0.700 | 0.128 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.3.attn.qkv_self.weight + | -0.000 | -0.220 | 0.311 | 0.065 | torch.Size([540]) || stage8.4.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.652 | 0.592 | 0.138 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.3.attn.proj.weight + | -0.019 | -0.535 | 0.426 | 0.154 | torch.Size([180]) || stage8.4.residual_group.blocks.3.attn.proj.bias + | 0.685 | 0.225 | 0.893 | 0.082 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.weight + | -0.023 | -0.211 | 0.938 | 0.093 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.bias + | -0.001 | -0.501 | 0.564 | 0.113 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc11.weight + | -0.014 | -0.339 | 0.237 | 0.092 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.560 | 0.626 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc12.weight + | 0.000 | -0.231 | 0.239 | 0.075 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.544 | 0.657 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.3.mlp.fc2.weight + | -0.007 | -0.271 | 0.274 | 0.093 | torch.Size([180]) || stage8.4.residual_group.blocks.3.mlp.fc2.bias + | -0.001 | -0.473 | 0.481 | 0.069 | torch.Size([180, 180]) || stage8.4.linear.weight + | 0.029 | -0.333 | 0.194 | 0.076 | torch.Size([180]) || stage8.4.linear.bias + | 1.025 | 0.297 | 1.336 | 0.162 | torch.Size([180]) || 
stage8.5.residual_group.blocks.0.norm1.weight + | -0.034 | -0.429 | 0.872 | 0.141 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.bias + | -0.574 | -4.515 | 3.381 | 0.800 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.771 | 0.886 | 0.125 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.0.attn.qkv_self.weight + | 0.000 | -0.356 | 0.521 | 0.085 | torch.Size([540]) || stage8.5.residual_group.blocks.0.attn.qkv_self.bias + | -0.001 | -0.632 | 0.656 | 0.147 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.0.attn.proj.weight + | -0.029 | -0.329 | 0.697 | 0.127 | torch.Size([180]) || stage8.5.residual_group.blocks.0.attn.proj.bias + | 0.777 | 0.446 | 0.952 | 0.069 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.weight + | -0.022 | -0.335 | 0.920 | 0.121 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.bias + | -0.002 | -0.520 | 0.598 | 0.117 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc11.weight + | -0.013 | -0.456 | 0.200 | 0.075 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.677 | 0.642 | 0.137 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc12.weight + | 0.005 | -0.272 | 0.233 | 0.083 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.762 | 0.598 | 0.136 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.0.mlp.fc2.weight + | -0.025 | -0.244 | 0.583 | 0.111 | torch.Size([180]) || stage8.5.residual_group.blocks.0.mlp.fc2.bias + | 1.021 | 0.261 | 1.261 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.weight + | -0.033 | -0.358 | 0.867 | 0.120 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.bias + | -0.550 | -3.274 | 4.406 | 0.670 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.819 | 0.986 | 0.122 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.1.attn.qkv_self.weight + | 0.005 | -0.510 | 0.446 | 0.084 | torch.Size([540]) || stage8.5.residual_group.blocks.1.attn.qkv_self.bias + | -0.003 | -0.739 | 0.682 | 0.151 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.1.attn.proj.weight + | -0.032 | -0.318 | 0.607 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.attn.proj.bias + | 0.823 | 0.420 | 0.950 | 0.070 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.weight + | -0.021 | -0.274 | 0.882 | 0.111 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.bias + | -0.002 | -0.496 | 0.532 | 0.117 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc11.weight + | -0.028 | -0.260 | 0.194 | 0.080 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.620 | 0.586 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc12.weight + | 0.004 | -0.284 | 0.423 | 0.083 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.774 | 0.614 | 0.137 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.1.mlp.fc2.weight + | -0.028 | -0.371 | 0.561 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.mlp.fc2.bias + | 1.096 | 0.377 | 1.321 | 0.110 | torch.Size([180]) || 
stage8.5.residual_group.blocks.2.norm1.weight + | -0.033 | -0.244 | 0.755 | 0.100 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.bias + | -0.441 | -3.439 | 5.870 | 0.668 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.710 | 0.679 | 0.123 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.2.attn.qkv_self.weight + | 0.003 | -0.277 | 0.283 | 0.068 | torch.Size([540]) || stage8.5.residual_group.blocks.2.attn.qkv_self.bias + | 0.001 | -0.824 | 0.684 | 0.150 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.2.attn.proj.weight + | -0.033 | -0.390 | 0.545 | 0.155 | torch.Size([180]) || stage8.5.residual_group.blocks.2.attn.proj.bias + | 0.843 | 0.390 | 0.984 | 0.076 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.weight + | -0.022 | -0.211 | 0.854 | 0.090 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.bias + | -0.002 | -0.522 | 0.503 | 0.116 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc11.weight + | -0.024 | -0.243 | 0.219 | 0.091 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc11.bias + | -0.001 | -0.638 | 0.617 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc12.weight + | -0.004 | -0.268 | 0.380 | 0.078 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.713 | 0.769 | 0.138 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.2.mlp.fc2.weight + | -0.034 | -0.372 | 0.592 | 0.151 | torch.Size([180]) || stage8.5.residual_group.blocks.2.mlp.fc2.bias + | 1.027 | 0.318 | 1.206 | 0.094 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.weight + | -0.033 | -0.187 | 0.768 | 0.088 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.bias + | -0.347 | -2.664 | 2.684 | 0.528 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.677 | 0.676 | 0.127 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.3.attn.qkv_self.weight + | 0.002 | -0.410 | 0.354 | 0.080 | torch.Size([540]) || stage8.5.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.630 | 0.725 | 0.145 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.3.attn.proj.weight + | -0.041 | -0.385 | 0.660 | 0.163 | torch.Size([180]) || stage8.5.residual_group.blocks.3.attn.proj.bias + | 0.849 | 0.390 | 0.985 | 0.070 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.weight + | -0.023 | -0.163 | 0.810 | 0.084 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.bias + | -0.002 | -0.547 | 0.536 | 0.115 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc11.weight + | -0.012 | -0.366 | 0.252 | 0.106 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.669 | 0.597 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc12.weight + | -0.002 | -0.216 | 0.202 | 0.074 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.700 | 0.674 | 0.139 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.3.mlp.fc2.weight + | -0.032 | -0.376 | 0.666 | 0.134 | torch.Size([180]) || stage8.5.residual_group.blocks.3.mlp.fc2.bias + | -0.001 | -0.299 | 0.469 | 0.069 | torch.Size([180, 180]) || 
stage8.5.linear.weight + | 0.081 | -0.562 | 0.263 | 0.109 | torch.Size([180]) || stage8.5.linear.bias + | 1.111 | 0.208 | 1.434 | 0.192 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.weight + | -0.048 | -0.547 | 0.851 | 0.175 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.bias + | -0.252 | -2.157 | 6.293 | 0.490 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.664 | 0.631 | 0.123 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.0.attn.qkv_self.weight + | 0.007 | -0.293 | 0.366 | 0.078 | torch.Size([540]) || stage8.6.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.701 | 0.726 | 0.154 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.0.attn.proj.weight + | 0.030 | -0.318 | 0.331 | 0.109 | torch.Size([180]) || stage8.6.residual_group.blocks.0.attn.proj.bias + | 0.959 | 0.475 | 1.322 | 0.088 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.weight + | -0.039 | -0.421 | 0.873 | 0.151 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.bias + | -0.002 | -0.550 | 0.783 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc11.weight + | 0.002 | -0.269 | 0.152 | 0.069 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.914 | 0.839 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc12.weight + | 0.001 | -0.340 | 0.304 | 0.075 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.592 | 0.713 | 0.140 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.0.mlp.fc2.weight + | 0.002 | -0.535 | 0.384 | 0.177 | torch.Size([180]) || stage8.6.residual_group.blocks.0.mlp.fc2.bias + | 1.123 | 0.183 | 1.352 | 0.165 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.weight + | -0.047 | -0.513 | 0.903 | 0.168 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.bias + | -0.234 | -1.968 | 6.366 | 0.448 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.751 | 0.759 | 0.121 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.1.attn.qkv_self.weight + | -0.001 | -0.300 | 0.214 | 0.061 | torch.Size([540]) || stage8.6.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.657 | 0.699 | 0.148 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.1.attn.proj.weight + | 0.031 | -0.321 | 0.293 | 0.115 | torch.Size([180]) || stage8.6.residual_group.blocks.1.attn.proj.bias + | 0.986 | 0.416 | 1.360 | 0.096 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.weight + | -0.038 | -0.393 | 0.807 | 0.146 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.bias + | -0.001 | -0.589 | 0.620 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc11.weight + | 0.005 | -0.316 | 0.229 | 0.071 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.738 | 0.766 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc12.weight + | 0.001 | -0.252 | 0.302 | 0.072 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.674 | 0.629 | 0.140 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.1.mlp.fc2.weight + | -0.001 | -0.475 | 
0.441 | 0.175 | torch.Size([180]) || stage8.6.residual_group.blocks.1.mlp.fc2.bias + | 1.097 | 0.342 | 1.294 | 0.134 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.weight + | -0.054 | -0.639 | 0.904 | 0.186 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.bias + | -0.135 | -3.252 | 1.238 | 0.360 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.672 | 0.663 | 0.128 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.2.attn.qkv_self.weight + | 0.007 | -0.170 | 0.228 | 0.046 | torch.Size([540]) || stage8.6.residual_group.blocks.2.attn.qkv_self.bias + | -0.001 | -0.660 | 0.651 | 0.147 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.2.attn.proj.weight + | 0.031 | -0.360 | 0.322 | 0.126 | torch.Size([180]) || stage8.6.residual_group.blocks.2.attn.proj.bias + | 1.004 | 0.360 | 1.381 | 0.099 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.weight + | -0.042 | -0.447 | 0.808 | 0.157 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.bias + | -0.000 | -0.600 | 0.603 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc11.weight + | 0.022 | -0.447 | 0.249 | 0.086 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.666 | 0.708 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc12.weight + | -0.002 | -0.326 | 0.272 | 0.075 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc12.bias + | -0.001 | -0.653 | 0.719 | 0.142 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.488 | 0.321 | 0.153 | torch.Size([180]) || stage8.6.residual_group.blocks.2.mlp.fc2.bias + | 1.095 | 0.272 | 1.302 | 0.123 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.weight + | -0.052 | -0.557 | 1.069 | 0.192 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.bias + | -0.196 | -2.349 | 1.401 | 0.360 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.741 | 0.657 | 0.124 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.3.attn.qkv_self.weight + | 0.001 | -0.186 | 0.141 | 0.040 | torch.Size([540]) || stage8.6.residual_group.blocks.3.attn.qkv_self.bias + | -0.001 | -0.669 | 0.671 | 0.139 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.3.attn.proj.weight + | -0.004 | -0.323 | 0.300 | 0.124 | torch.Size([180]) || stage8.6.residual_group.blocks.3.attn.proj.bias + | 0.999 | 0.383 | 1.380 | 0.103 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.weight + | -0.044 | -0.392 | 0.694 | 0.163 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.bias + | 0.000 | -0.577 | 0.857 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc11.weight + | 0.041 | -0.394 | 0.238 | 0.087 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.924 | 0.828 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc12.weight + | -0.003 | -0.214 | 0.407 | 0.071 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.827 | 0.755 | 0.141 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.3.mlp.fc2.weight + | 0.022 | -0.296 | 0.262 | 0.107 | 
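
Every block's MLP in this table comes as the fc11/fc12/fc2 triple ([360, 180], [360, 180], [180, 360] in the 180-dim stages): two parallel expansions feeding one projection, i.e. a gated feed-forward, which the structure dump further down prints as Mlp_GEGLU. A sketch matching those shapes, assuming the usual GEGLU gating fc2(GELU(fc11(x)) * fc12(x)):

    import torch
    import torch.nn as nn

    class MlpGEGLU(nn.Module):
        # Gated feed-forward consistent with the printed fc11/fc12/fc2 shapes;
        # the exact gating order is the conventional GEGLU form, an assumption.
        def __init__(self, dim=180, hidden=360, drop=0.0):
            super().__init__()
            self.fc11 = nn.Linear(dim, hidden)   # expansion branch
            self.fc12 = nn.Linear(dim, hidden)   # gating branch
            self.act = nn.GELU()
            self.fc2 = nn.Linear(hidden, dim)    # projection back to dim
            self.drop = nn.Dropout(drop)

        def forward(self, x):
            return self.drop(self.fc2(self.act(self.fc11(x)) * self.fc12(x)))
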
torch.Size([180]) || stage8.6.residual_group.blocks.3.mlp.fc2.bias + | 0.002 | -1.059 | 1.262 | 0.089 | torch.Size([180, 180]) || stage8.6.linear.weight + | 0.031 | -0.789 | 0.427 | 0.120 | torch.Size([180]) || stage8.6.linear.bias + | 0.389 | 0.079 | 1.137 | 0.176 | torch.Size([180]) || norm.weight + | -0.021 | -0.669 | 0.888 | 0.127 | torch.Size([180]) || norm.bias + | 0.000 | -0.486 | 0.568 | 0.103 | torch.Size([120, 180]) || conv_after_body.weight + | -0.000 | -0.167 | 0.168 | 0.055 | torch.Size([120]) || conv_after_body.bias + | -0.000 | -1.782 | 1.300 | 0.109 | torch.Size([64, 120, 1, 3, 3]) || conv_before_upsample.0.weight + | -0.019 | -0.542 | 0.437 | 0.162 | torch.Size([64]) || conv_before_upsample.0.bias + | 0.001 | -1.915 | 1.372 | 0.090 | torch.Size([256, 64, 1, 3, 3]) || upsample.0.weight + | -0.045 | -0.281 | 0.215 | 0.097 | torch.Size([256]) || upsample.0.bias + | -0.006 | -4.826 | 0.582 | 0.075 | torch.Size([256, 64, 1, 3, 3]) || upsample.5.weight + | -0.154 | -0.441 | 0.187 | 0.100 | torch.Size([256]) || upsample.5.bias + | 0.000 | -0.210 | 0.246 | 0.012 | torch.Size([64, 64, 1, 3, 3]) || upsample.10.weight + | 0.000 | -0.013 | 0.007 | 0.003 | torch.Size([64]) || upsample.10.bias + | 0.000 | -0.044 | 0.042 | 0.004 | torch.Size([3, 64, 1, 3, 3]) || conv_last.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([3]) || conv_last.bias + +22-03-11 10:46:12.537 : task: 001_train_vrt_videosr_bi_reds_6frames + model: vrt + gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7] + dist: False + find_unused_parameters: False + use_static_graph: True + scale: 4 + n_channels: 3 + path:[ + root: experiments + pretrained_netG: /home/cll/dev/KAIR/model_zoo/vrt/001_VRT_videosr_bi_REDS_6frames.pth + pretrained_netE: None + task: experiments/001_train_vrt_videosr_bi_reds_6frames + log: experiments/001_train_vrt_videosr_bi_reds_6frames + options: experiments/001_train_vrt_videosr_bi_reds_6frames/options + models: experiments/001_train_vrt_videosr_bi_reds_6frames/models + images: experiments/001_train_vrt_videosr_bi_reds_6frames/images + pretrained_optimizerG: None + ] + datasets:[ + train:[ + name: train_dataset + dataset_type: VideoRecurrentTrainDataset + dataroot_gt: /home/cll/datasets/REDS/train/train_sharp + dataroot_lq: /home/cll/datasets/REDS/train/train_sharp_bicubic/X4 + meta_info_file: data/meta_info/meta_info_REDS_GT.txt + filename_tmpl: 08d + filename_ext: png + val_partition: REDS4 + test_mode: False + io_backend:[ + type: disk + ] + num_frame: 6 + gt_size: 256 + interval_list: [1] + random_reverse: False + use_hflip: True + use_rot: True + dataloader_shuffle: True + dataloader_num_workers: 32 + dataloader_batch_size: 8 + phase: train + scale: 4 + n_channels: 3 + ] + test:[ + name: test_dataset + dataset_type: VideoRecurrentTestDataset + dataroot_gt: /home/cll/Desktop/REDS4/GT + dataroot_lq: /home/cll/Desktop/REDS4/sharp_bicubic + cache_data: True + io_backend:[ + type: disk + ] + num_frame: -1 + phase: test + scale: 4 + n_channels: 3 + ] + ] + netG:[ + net_type: vrt + upscale: 4 + img_size: [6, 64, 64] + window_size: [6, 8, 8] + depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4] + indep_reconsts: [11, 12] + embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180] + num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6] + spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth + pa_frames: 2 + deformable_groups: 12 + nonblind_denoising: False + use_checkpoint_attn: False + use_checkpoint_ffn: False + no_checkpoint_attn_blocks: [] + no_checkpoint_ffn_blocks: [] + init_type: default 
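
The parameter table above closes with the reconstruction head: conv_after_body (a 180-to-120 projection, per its [120, 180] weight), conv_before_upsample (120 to 64), two 64-to-256 convolutions at upsample.0 and upsample.5, a 64-to-64 convolution at upsample.10, and conv_last (64 to 3). Since 256 = 4*64, each of the two wide convolutions feeds a PixelShuffle(2), giving the x4 scale. A per-frame sketch of such a head (the logged layers are Conv3d with (1, 3, 3) kernels, i.e. the same convolution applied frame by frame; the modules hidden between the logged indices 0, 5 and 10, and the activations, are assumptions):

    import torch.nn as nn

    def make_x4_head(nf=64, out_ch=3):
        # 64 -> 256 conv + PixelShuffle(2), twice, then 64 -> 64 and a final
        # 64 -> 3 conv, matching the conv shapes logged at upsample.0 / .5 /
        # .10 and conv_last.
        return nn.Sequential(
            nn.Conv2d(nf, 4 * nf, 3, 1, 1), nn.PixelShuffle(2), nn.LeakyReLU(0.1, True),
            nn.Conv2d(nf, 4 * nf, 3, 1, 1), nn.PixelShuffle(2), nn.LeakyReLU(0.1, True),
            nn.Conv2d(nf, nf, 3, 1, 1), nn.LeakyReLU(0.1, True),
            nn.Conv2d(nf, out_ch, 3, 1, 1),
        )
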
+ scale: 4 + ] + train:[ + G_lossfn_type: charbonnier + G_lossfn_weight: 1.0 + G_charbonnier_eps: 1e-09 + E_decay: 0 + G_optimizer_type: adam + G_optimizer_lr: 0.0004 + G_optimizer_betas: [0.9, 0.99] + G_optimizer_wd: 0 + G_optimizer_clipgrad: None + G_optimizer_reuse: True + fix_iter: 20000 + fix_lr_mul: 0.125 + fix_keys: ['spynet', 'deform'] + total_iter: 300000 + G_scheduler_type: CosineAnnealingWarmRestarts + G_scheduler_periods: 300000 + G_scheduler_eta_min: 1e-07 + G_regularizer_orthstep: None + G_regularizer_clipstep: None + G_param_strict: True + E_param_strict: True + checkpoint_test: 5000 + checkpoint_save: 5000 + checkpoint_print: 200 + F_feature_layer: 34 + F_weights: 1.0 + F_lossfn_type: l1 + F_use_input_norm: True + F_use_range_norm: False + G_scheduler_restart_weights: 1 + ] + val:[ + save_img: False + pad_seq: False + flip_seq: False + center_frame_only: False + num_frame_testing: 40 + num_frame_overlapping: 2 + size_patch_testing: 128 + ] + opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json + is_train: True + merge_bn: False + merge_bn_startpoint: -1 + num_gpu: 8 + rank: 0 + world_size: 1 + +22-03-11 10:46:12.583 : Number of train images: 27,000, iters: 3,375 +22-03-11 10:46:26.822 : +Networks name: VRT +Params number: 30676435 +Net structure: +VRT( + (conv_first): Conv3d(27, 120, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (spynet): SpyNet( + (basic_module): ModuleList( + (0): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (1): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (2): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (3): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (4): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), 
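
The train:[ ... ] block above selects G_lossfn_type: charbonnier with G_charbonnier_eps: 1e-09, Adam at lr 4e-4, and a single 300k-iteration cosine decay down to eta_min: 1e-07; per the option names, fix_iter: 20000 with fix_lr_mul: 0.125 and fix_keys: ['spynet', 'deform'] holds the pretrained flow and deformable-alignment weights back early in training (frozen, then at roughly 1/8 of the base learning rate). The Charbonnier loss is a smooth L1 variant; one common form (whether eps enters squared, and the reduction, vary between implementations):

    import torch

    def charbonnier_loss(pred, target, eps=1e-9):
        # Differentiable L1 variant; eps matches G_charbonnier_eps = 1e-09.
        # Mean reduction is an assumption.
        return torch.sqrt((pred - target) ** 2 + eps).mean()
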
padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (5): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + ) + ) + (stage1): Stage( + (reshape): Sequential( + (0): Rearrange('n c d h w -> n d h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + 
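
The SpyNet printed above is six identical BasicModules forming a coarse-to-fine optical-flow pyramid; each level maps 8 input channels (two RGB frames plus the 2-channel flow upsampled from the previous level, in the standard SpyNet layout) to a 2-channel flow refinement through 7x7 convolutions. One level, reproduced from the printout:

    import torch.nn as nn

    def spynet_basic_module():
        # One pyramid level exactly as printed: 8 -> 32 -> 64 -> 32 -> 16 -> 2
        # channels, all 7x7 convs with stride 1 and padding 3, ReLU between.
        return nn.Sequential(
            nn.Conv2d(8, 32, 7, 1, 3), nn.ReLU(),
            nn.Conv2d(32, 64, 7, 1, 3), nn.ReLU(),
            nn.Conv2d(64, 32, 7, 1, 3), nn.ReLU(),
            nn.Conv2d(32, 16, 7, 1, 3), nn.ReLU(),
            nn.Conv2d(16, 2, 7, 1, 3),
        )
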
(fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + 
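
In residual_group1 every TMSA block carries both qkv_self and qkv_mut (each 120 to 360) and a proj with in_features=240, while in residual_group2 qkv_mut disappears and proj is 120 to 120. That is consistent with the first group concatenating a self-attention output and a mutual (cross-frame) attention output, each 120-dimensional, before projecting back to 120. A much-simplified sketch of that shape bookkeeping (the real TMSA adds relative position bias, window shifts and masking):

    import torch
    import torch.nn as nn

    class SelfMutualAttention(nn.Module):
        # Shape bookkeeping only: self and mutual branches each yield 120
        # channels; their concatenation explains proj's in_features=240.
        def __init__(self, dim=120, heads=6):
            super().__init__()
            self.heads = heads
            self.qkv_self = nn.Linear(dim, 3 * dim)   # as printed: 120 -> 360
            self.qkv_mut = nn.Linear(dim, 3 * dim)    # as printed: 120 -> 360
            self.proj = nn.Linear(2 * dim, dim)       # as printed: 240 -> 120

        def _attn(self, q, k, v):
            b, n, c = q.shape
            h = self.heads
            q, k, v = (t.reshape(b, n, h, c // h).transpose(1, 2) for t in (q, k, v))
            out = torch.softmax(q @ k.transpose(-2, -1) / (c // h) ** 0.5, -1) @ v
            return out.transpose(1, 2).reshape(b, n, c)

        def forward(self, x1, x2):                    # two aligned frames
            q1, k1, v1 = self.qkv_self(x1).chunk(3, -1)
            q2, k2, v2 = self.qkv_mut(x2).chunk(3, -1)
            y_self = self._attn(q1, k1, v1)           # within-frame attention
            y_mut = self._attn(q1, k2, v2)            # cross-frame attention
            return self.proj(torch.cat([y_self, y_mut], dim=-1))

    # e.g. SelfMutualAttention()(torch.randn(2, 384, 120), torch.randn(2, 384, 120))
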
(6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage2): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, 
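
The pa_deform conv_offset stack above ends in 324 output channels: with deformable_groups: 12 and a 3x3 kernel, modulated deformable convolution (DCNv2) needs 12*9*2 offset values plus 12*9 modulation masks, i.e. 12*9*3 = 324. The 242-channel input is consistent with two 120-channel feature maps plus a 2-channel optical flow (an assumption from the shapes alone). A sketch of the customary split of that output (the flow-guided offset arithmetic of DCNv2PackFlowGuided itself is omitted):

    import torch

    out = torch.randn(1, 324, 64, 64)           # conv_offset output
    o1, o2, mask = torch.chunk(out, 3, dim=1)   # three 108-channel groups
    offset = torch.cat((o1, o2), dim=1)         # 216 = 12*9*2 sampling offsets
    mask = torch.sigmoid(mask)                  # 108 = 12*9 masks in (0, 1)
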
elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, 
bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage3): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): 
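The (reshape): Sequential blocks that open stage2 through stage4 above implement the encoder's spatial downsampling: a space-to-channel rearrange halves H and W while multiplying channels by 4 (120 -> 480), then LayerNorm plus Linear fuse them back to 120. A minimal sketch of what those four printed modules compute, assuming standard einops semantics (illustrative, not the actual KAIR code):

```python
# Sketch of the printed stage2-stage4 "reshape" Sequential: space-to-channel
# downsampling. Requires einops; shapes follow the log, details are illustrative.
import torch
import torch.nn as nn
from einops.layers.torch import Rearrange

downsample = nn.Sequential(
    Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2),
    nn.LayerNorm(480),            # 4 * 120 = 480 channels after the rearrange
    nn.Linear(480, 120),          # fuse back to the 120-dim embedding
    Rearrange('n d h w c -> n c d h w'),
)

x = torch.randn(1, 120, 6, 64, 64)   # (N, C, D, H, W)
print(downsample(x).shape)           # torch.Size([1, 120, 6, 32, 32])
```

stage5 through stage7 below print the mirror pattern ('n (neiw neih c) d h w -> n d (h neih) (w neiw) c' with LayerNorm((30,)) and Linear(30, 120)), i.e. channel-to-space upsampling on the decoder side.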
DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage4): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, 
bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + 
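Every (mlp): Mlp_GEGLU in this dump pairs two input projections (fc11, fc12) with a single GELU and an output projection (fc2), with the hidden width at 2x the embedding dim (240 for the 120-dim stages, 360 for the 180-dim ones below). That layout is consistent with a gated-GELU (GEGLU) feed-forward; a minimal sketch under that assumption (the real VRT class may differ in details):

```python
# Sketch of the printed Mlp_GEGLU layout: fc2(GELU(fc11(x)) * fc12(x)).
# Dimensions match the 120-dim stages in the log.
import torch
import torch.nn as nn

class MlpGEGLU(nn.Module):
    def __init__(self, dim=120, hidden=240, drop=0.0):
        super().__init__()
        self.fc11 = nn.Linear(dim, hidden)   # gate branch
        self.fc12 = nn.Linear(dim, hidden)   # value branch
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden, dim)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        return self.drop(self.fc2(self.act(self.fc11(x)) * self.fc12(x)))

print(MlpGEGLU()(torch.randn(8, 120)).shape)  # torch.Size([8, 120])
```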
(drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage5): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): 
Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): 
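In each residual_group1 block the attention carries two QKV projections, qkv_self and qkv_mut (both 120 -> 360, i.e. 3x120 for q/k/v over 6 heads), while proj takes 240 inputs; in residual_group2 there is no qkv_mut and proj is 120 -> 120. The 240 is consistent with concatenating a 120-dim self-attention output with a 120-dim mutual-attention output before projecting back. A shape-only sketch (plain attention on both branches; the actual mutual attention exchanges queries and keys across the two frames):

```python
# Shape bookkeeping only, not VRT's implementation:
# qkv_self: 120 -> 360, qkv_mut: 120 -> 360, proj: 240 -> 120.
import torch
import torch.nn as nn
import torch.nn.functional as F

dim, heads, tokens = 120, 6, 128          # 128 = 2 frames * 8*8 window (see stats below)
qkv_self = nn.Linear(dim, 3 * dim)
qkv_mut = nn.Linear(dim, 3 * dim)
proj = nn.Linear(2 * dim, dim)

def attn(qkv):
    q, k, v = qkv.chunk(3, dim=-1)
    q, k, v = (t.view(1, tokens, heads, -1).transpose(1, 2) for t in (q, k, v))
    out = F.softmax(q @ k.transpose(-2, -1) / q.shape[-1] ** 0.5, dim=-1) @ v
    return out.transpose(1, 2).reshape(1, tokens, dim)

x = torch.randn(1, tokens, dim)
y = proj(torch.cat([attn(qkv_self(x)), attn(qkv_mut(x))], dim=-1))
print(y.shape)  # torch.Size([1, 128, 120])
```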
Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage6): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, 
out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, 
out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage7): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): 
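The pa_deform conv_offset stacks all start from 242 channels and end at 324. Both counts are consistent with the printed options: 242 = 120 + 120 + 2 (a feature map, a flow-warped neighbor feature, and a 2-channel optical flow, matching pa_frames: 2), and 324 = 12 x 9 x (2 + 1) (deformable_groups: 12, a 3x3 kernel with 9 sampling points, and 2 offset coordinates plus 1 DCNv2 modulation mask per point). A quick check of that arithmetic, under those assumptions:

```python
# Check of the conv_offset channel counts printed above (assumption:
# DCNv2-style offsets + masks, deformable_groups=12, 3x3 kernel, pa_frames=2).
deformable_groups, kh, kw = 12, 3, 3
out_ch = deformable_groups * kh * kw * (2 + 1)   # 2 offset coords + 1 mask
in_ch = 120 + 120 + 2                            # feat, warped feat, optical flow
print(in_ch, out_ch)                             # 242 324
```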
WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): 
LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage8): ModuleList( + (0): Sequential( + (0): Rearrange('n c d h w -> n d h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=120, out_features=180, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (1): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, 
out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (2): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): 
WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (3): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (4): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): 
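The stage8 entries are RTMSA rather than Stage: width grows to 180, the attention keeps only qkv_self (no mutual branch, no pa_deform), and each RTMSA wraps its TMSAG group plus a trailing Linear. The printed layout suggests a residual wrapper of roughly the form x + linear(blocks(x)); a sketch under that assumption (channel-last for brevity, whereas VRT itself works on (N, C, D, H, W) tensors with transposes):

```python
# Sketch of the RTMSA skeleton (residual wrapper around a block group);
# nn.Identity stands in for the TMSAG block list, which is elided here.
import torch
import torch.nn as nn

class RTMSA(nn.Module):
    def __init__(self, dim=180, blocks=None):
        super().__init__()
        self.residual_group = blocks or nn.Identity()  # placeholder for TMSAG
        self.linear = nn.Linear(dim, dim)

    def forward(self, x):                 # x: (N, D, H, W, C), channel-last
        return x + self.linear(self.residual_group(x))

print(RTMSA()(torch.randn(1, 6, 8, 8, 180)).shape)  # torch.Size([1, 6, 8, 8, 180])
```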
Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (5): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, 
bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (6): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, 
elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + ) + (norm): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (conv_after_body): Linear(in_features=180, out_features=120, bias=True) + (conv_before_upsample): Sequential( + (0): Conv3d(120, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): LeakyReLU(negative_slope=0.01, inplace=True) + ) + (upsample): Upsample( + (0): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): Transpose_Dim12() + (2): PixelShuffle(upscale_factor=2) + (3): Transpose_Dim12() + (4): LeakyReLU(negative_slope=0.1, inplace=True) + (5): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (6): Transpose_Dim12() + (7): PixelShuffle(upscale_factor=2) + (8): Transpose_Dim12() + (9): LeakyReLU(negative_slope=0.1, inplace=True) + (10): Conv3d(64, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + ) + (conv_last): Conv3d(64, 3, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) +) + +22-03-11 10:46:27.000 : + | mean | min | max | std || shape + | -0.000 | -1.462 | 1.580 | 0.103 | torch.Size([120, 27, 1, 3, 3]) || conv_first.weight + | 0.005 | -0.950 | 0.885 | 0.268 | torch.Size([120]) || conv_first.bias + | 0.449 | 0.406 | 0.485 | 0.040 | torch.Size([1, 3, 1, 1]) || spynet.mean + | 0.226 | 0.224 | 0.229 | 0.003 | torch.Size([1, 3, 1, 1]) || spynet.std + | -0.000 | -0.679 | 0.720 | 0.066 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.0.basic_module.0.weight + | -0.042 | -0.894 | 0.351 | 0.344 | torch.Size([32]) || spynet.basic_module.0.basic_module.0.bias + | -0.008 | -3.201 | 0.948 | 0.097 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.0.basic_module.2.weight + | 0.059 | -1.268 | 0.732 | 0.320 | torch.Size([64]) || spynet.basic_module.0.basic_module.2.bias + | -0.010 | -4.633 | 0.568 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.0.basic_module.4.weight + | 0.159 | -0.704 | 0.859 | 0.353 | torch.Size([32]) || spynet.basic_module.0.basic_module.4.bias + | -0.024 | -1.714 | 0.414 | 0.091 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.0.basic_module.6.weight + | 0.780 | -1.061 | 1.162 | 0.519 | torch.Size([16]) || spynet.basic_module.0.basic_module.6.bias + | 0.000 | -0.144 | 0.163 | 0.018 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.0.basic_module.8.weight + | 0.001 | -0.003 | 0.005 | 0.006 | torch.Size([2]) || spynet.basic_module.0.basic_module.8.bias + | 0.000 | -0.726 | 0.773 | 0.070 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.1.basic_module.0.weight + | -0.021 | -0.814 | 0.355 | 0.323 | torch.Size([32]) || spynet.basic_module.1.basic_module.0.bias + | -0.010 | -3.380 | 0.916 | 0.099 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.1.basic_module.2.weight + | 0.038 | -1.207 | 0.714 | 0.301 | torch.Size([64]) || spynet.basic_module.1.basic_module.2.bias + | -0.008 | -4.462 | 0.549 | 0.088 | torch.Size([32, 64, 7, 7]) || 
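The table that starts here (and runs for the rest of this excerpt) lists per-parameter and per-buffer statistics, one row per tensor: mean, min, max, std, then the shape and the qualified name. A small helper that reproduces this kind of table from any nn.Module (a sketch; not necessarily how KAIR formats its log):

```python
# Reproduce a "| mean | min | max | std | shape || name" table for a model's
# parameters and buffers, mirroring the rows below.
import torch

def describe(model):
    print(' | mean | min | max | std || shape')
    for name, t in list(model.named_parameters()) + list(model.named_buffers()):
        t = t.detach().float()
        print(' | {:.3f} | {:.3f} | {:.3f} | {:.3f} | {} || {}'.format(
            t.mean().item(), t.min().item(), t.max().item(), t.std().item(),
            t.shape, name))

describe(torch.nn.Linear(120, 360))
```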
spynet.basic_module.1.basic_module.4.weight + | 0.157 | -0.742 | 0.980 | 0.384 | torch.Size([32]) || spynet.basic_module.1.basic_module.4.bias + | -0.020 | -1.648 | 0.319 | 0.084 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.1.basic_module.6.weight + | 0.775 | -1.195 | 1.148 | 0.546 | torch.Size([16]) || spynet.basic_module.1.basic_module.6.bias + | -0.000 | -0.122 | 0.152 | 0.016 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.1.basic_module.8.weight + | -0.000 | -0.002 | 0.001 | 0.002 | torch.Size([2]) || spynet.basic_module.1.basic_module.8.bias + | 0.000 | -0.956 | 0.870 | 0.088 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.2.basic_module.0.weight + | -0.025 | -1.040 | 0.512 | 0.411 | torch.Size([32]) || spynet.basic_module.2.basic_module.0.bias + | -0.011 | -4.624 | 1.195 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.2.basic_module.2.weight + | 0.023 | -1.284 | 0.699 | 0.308 | torch.Size([64]) || spynet.basic_module.2.basic_module.2.bias + | -0.009 | -1.831 | 0.616 | 0.092 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.2.basic_module.4.weight + | 0.120 | -0.695 | 0.755 | 0.332 | torch.Size([32]) || spynet.basic_module.2.basic_module.4.bias + | -0.013 | -1.285 | 0.304 | 0.068 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.2.basic_module.6.weight + | 0.681 | -1.725 | 0.942 | 0.646 | torch.Size([16]) || spynet.basic_module.2.basic_module.6.bias + | 0.000 | -0.045 | 0.071 | 0.009 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.2.basic_module.8.weight + | -0.010 | -0.010 | -0.009 | 0.000 | torch.Size([2]) || spynet.basic_module.2.basic_module.8.bias + | -0.000 | -0.995 | 0.879 | 0.090 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.3.basic_module.0.weight + | -0.040 | -1.137 | 0.617 | 0.461 | torch.Size([32]) || spynet.basic_module.3.basic_module.0.bias + | -0.010 | -4.891 | 1.224 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.3.basic_module.2.weight + | 0.022 | -1.287 | 0.745 | 0.313 | torch.Size([64]) || spynet.basic_module.3.basic_module.2.bias + | -0.010 | -1.802 | 0.561 | 0.090 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.3.basic_module.4.weight + | 0.118 | -0.694 | 0.697 | 0.329 | torch.Size([32]) || spynet.basic_module.3.basic_module.4.bias + | -0.012 | -1.107 | 0.306 | 0.064 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.3.basic_module.6.weight + | 0.658 | -1.792 | 0.905 | 0.659 | torch.Size([16]) || spynet.basic_module.3.basic_module.6.bias + | 0.000 | -0.030 | 0.037 | 0.006 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.3.basic_module.8.weight + | 0.003 | -0.001 | 0.007 | 0.006 | torch.Size([2]) || spynet.basic_module.3.basic_module.8.bias + | -0.000 | -0.990 | 0.880 | 0.090 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.4.basic_module.0.weight + | -0.010 | -1.067 | 0.596 | 0.437 | torch.Size([32]) || spynet.basic_module.4.basic_module.0.bias + | -0.010 | -5.061 | 1.229 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.4.basic_module.2.weight + | 0.024 | -1.274 | 0.830 | 0.318 | torch.Size([64]) || spynet.basic_module.4.basic_module.2.bias + | -0.009 | -1.787 | 0.563 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.4.basic_module.4.weight + | 0.130 | -0.685 | 0.743 | 0.335 | torch.Size([32]) || spynet.basic_module.4.basic_module.4.bias + | -0.011 | -0.973 | 0.292 | 0.061 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.4.basic_module.6.weight + | 0.659 | -1.855 | 0.931 | 0.679 | torch.Size([16]) || spynet.basic_module.4.basic_module.6.bias + | 0.000 | -0.034 | 0.040 | 0.005 | 
torch.Size([2, 16, 7, 7]) || spynet.basic_module.4.basic_module.8.weight + | -0.001 | -0.009 | 0.007 | 0.012 | torch.Size([2]) || spynet.basic_module.4.basic_module.8.bias + | -0.000 | -0.973 | 0.853 | 0.089 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.5.basic_module.0.weight + | 0.022 | -1.001 | 0.571 | 0.440 | torch.Size([32]) || spynet.basic_module.5.basic_module.0.bias + | -0.009 | -5.095 | 1.251 | 0.119 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.5.basic_module.2.weight + | 0.026 | -1.305 | 0.880 | 0.326 | torch.Size([64]) || spynet.basic_module.5.basic_module.2.bias + | -0.008 | -1.815 | 0.561 | 0.091 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.5.basic_module.4.weight + | 0.137 | -0.711 | 0.771 | 0.342 | torch.Size([32]) || spynet.basic_module.5.basic_module.4.bias + | -0.010 | -0.986 | 0.286 | 0.059 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.5.basic_module.6.weight + | 0.671 | -1.913 | 0.966 | 0.700 | torch.Size([16]) || spynet.basic_module.5.basic_module.6.bias + | 0.000 | -0.034 | 0.028 | 0.002 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.5.basic_module.8.weight + | 0.002 | -0.013 | 0.016 | 0.020 | torch.Size([2]) || spynet.basic_module.5.basic_module.8.bias + | 1.280 | 0.669 | 1.862 | 0.274 | torch.Size([120]) || stage1.reshape.1.weight + | -0.006 | -0.324 | 0.337 | 0.106 | torch.Size([120]) || stage1.reshape.1.bias + | 0.579 | 0.129 | 1.064 | 0.236 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.weight + | -0.039 | -1.100 | 0.894 | 0.226 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.bias + | -0.134 | -4.020 | 2.585 | 0.295 | torch.Size([675, 6]) || stage1.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.579 | 0.618 | 0.113 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_self.weight + | 0.000 | -0.319 | 0.279 | 0.074 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_self.bias + | 0.001 | -0.634 | 0.686 | 0.076 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.attn.proj.weight + | -0.014 | -0.222 | 0.642 | 0.088 | torch.Size([120]) || stage1.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -1.066 | 0.928 | 0.097 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.000 | -0.146 | 0.190 | 0.033 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.781 | 0.367 | 1.203 | 0.160 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.weight + | 0.029 | -0.378 | 0.545 | 0.159 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.bias + | 0.001 | -0.687 | 0.753 | 0.108 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc11.weight + | -0.010 | -0.229 | 0.633 | 0.095 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.674 | 0.669 | 0.117 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc12.weight + | 0.011 | -0.448 | 0.368 | 0.116 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc12.bias + | 0.001 | -0.862 | 0.941 | 0.119 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.mlp.fc2.weight + | -0.004 | -0.267 | 0.594 | 0.099 | torch.Size([120]) || stage1.residual_group1.blocks.0.mlp.fc2.bias + | 0.797 | 0.211 | 1.475 | 0.209 | torch.Size([120]) || 
stage1.residual_group1.blocks.1.norm1.weight + | -0.161 | -1.941 | 0.746 | 0.237 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.bias + | -0.296 | -3.927 | 2.840 | 0.478 | torch.Size([675, 6]) || stage1.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.1.attn.position_bias + | 0.001 | -1.479 | 1.395 | 0.143 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_self.weight + | -0.003 | -0.381 | 0.258 | 0.063 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.526 | 0.561 | 0.079 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.attn.proj.weight + | -0.003 | -0.178 | 0.478 | 0.078 | torch.Size([120]) || stage1.residual_group1.blocks.1.attn.proj.bias + | 0.001 | -1.242 | 1.138 | 0.105 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.004 | -0.213 | 0.196 | 0.050 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.702 | 0.349 | 0.904 | 0.085 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.weight + | 0.039 | -0.646 | 0.384 | 0.132 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.872 | 0.750 | 0.131 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc11.weight + | -0.049 | -0.353 | 0.135 | 0.084 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.562 | 0.580 | 0.117 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc12.weight + | 0.000 | -0.238 | 0.457 | 0.113 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.828 | 0.685 | 0.123 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.mlp.fc2.weight + | 0.031 | -0.297 | 0.419 | 0.094 | torch.Size([120]) || stage1.residual_group1.blocks.1.mlp.fc2.bias + | 0.984 | 0.163 | 1.398 | 0.202 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.weight + | -0.167 | -1.609 | 0.367 | 0.182 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.bias + | -0.343 | -4.484 | 2.362 | 0.486 | torch.Size([675, 6]) || stage1.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.2.attn.position_bias + | 0.000 | -1.586 | 1.649 | 0.151 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_self.weight + | -0.000 | -0.220 | 0.240 | 0.056 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.378 | 0.514 | 0.086 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.attn.proj.weight + | -0.009 | -0.143 | 0.172 | 0.059 | torch.Size([120]) || stage1.residual_group1.blocks.2.attn.proj.bias + | 0.001 | -0.639 | 0.582 | 0.102 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.000 | -0.141 | 0.173 | 0.035 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.733 | 0.277 | 0.903 | 0.081 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.weight + | 0.038 | -0.861 | 0.359 | 0.142 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.787 | 0.679 | 0.131 | torch.Size([240, 120]) || 
stage1.residual_group1.blocks.2.mlp.fc11.weight + | -0.029 | -0.365 | 0.143 | 0.076 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.574 | 0.539 | 0.120 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc12.weight + | -0.007 | -0.283 | 0.254 | 0.097 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc12.bias + | 0.001 | -0.998 | 0.522 | 0.124 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.mlp.fc2.weight + | 0.030 | -0.169 | 0.293 | 0.095 | torch.Size([120]) || stage1.residual_group1.blocks.2.mlp.fc2.bias + | 1.035 | 0.143 | 1.397 | 0.196 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.weight + | -0.161 | -1.413 | 0.084 | 0.154 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.bias + | -0.441 | -4.685 | 3.306 | 0.529 | torch.Size([675, 6]) || stage1.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.3.attn.position_bias + | 0.000 | -1.590 | 1.329 | 0.155 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_self.weight + | -0.002 | -0.266 | 0.232 | 0.049 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.366 | 0.372 | 0.084 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.attn.proj.weight + | -0.011 | -0.225 | 0.171 | 0.071 | torch.Size([120]) || stage1.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.660 | 0.801 | 0.100 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.001 | -0.139 | 0.200 | 0.031 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.724 | 0.190 | 0.911 | 0.091 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.weight + | 0.038 | -0.981 | 0.285 | 0.137 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.bias + | 0.001 | -0.611 | 0.598 | 0.130 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc11.weight + | -0.035 | -0.299 | 0.221 | 0.081 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.502 | 0.520 | 0.124 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc12.weight + | -0.002 | -0.271 | 0.215 | 0.090 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.558 | 0.898 | 0.127 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.mlp.fc2.weight + | 0.010 | -0.424 | 0.190 | 0.082 | torch.Size([120]) || stage1.residual_group1.blocks.3.mlp.fc2.bias + | 1.085 | 0.169 | 1.400 | 0.157 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.weight + | -0.086 | -1.613 | 0.150 | 0.160 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.bias + | -0.541 | -3.902 | 3.728 | 0.633 | torch.Size([675, 6]) || stage1.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.4.attn.position_bias + | 0.001 | -1.879 | 1.832 | 0.150 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_self.weight + | 0.001 | -0.391 | 0.444 | 0.079 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_self.bias + | -0.000 | -0.407 | 0.448 | 0.087 | torch.Size([120, 240]) || 
stage1.residual_group1.blocks.4.attn.proj.weight + | -0.013 | -0.302 | 0.342 | 0.104 | torch.Size([120]) || stage1.residual_group1.blocks.4.attn.proj.bias + | -0.001 | -0.830 | 0.863 | 0.102 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.001 | -0.117 | 0.094 | 0.024 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.704 | 0.195 | 0.870 | 0.079 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.weight + | 0.031 | -1.069 | 0.276 | 0.140 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.656 | 0.555 | 0.130 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc11.weight + | -0.029 | -0.387 | 0.256 | 0.102 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc11.bias + | 0.001 | -0.590 | 0.624 | 0.127 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc12.weight + | -0.011 | -0.277 | 0.303 | 0.087 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -1.124 | 0.539 | 0.130 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.mlp.fc2.weight + | -0.006 | -0.718 | 0.133 | 0.094 | torch.Size([120]) || stage1.residual_group1.blocks.4.mlp.fc2.bias + | 1.037 | 0.176 | 1.327 | 0.158 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.weight + | -0.112 | -1.591 | 0.177 | 0.169 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.bias + | -0.438 | -2.229 | 2.797 | 0.523 | torch.Size([675, 6]) || stage1.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.5.attn.position_bias + | -0.000 | -2.212 | 1.826 | 0.153 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_self.weight + | 0.001 | -0.343 | 0.338 | 0.068 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_self.bias + | 0.000 | -0.367 | 0.451 | 0.087 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.attn.proj.weight + | -0.022 | -0.358 | 0.242 | 0.128 | torch.Size([120]) || stage1.residual_group1.blocks.5.attn.proj.bias + | 0.001 | -0.922 | 0.886 | 0.104 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.002 | -0.083 | 0.089 | 0.022 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.662 | 0.277 | 0.831 | 0.066 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.weight + | 0.025 | -0.959 | 0.261 | 0.132 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.bias + | -0.001 | -0.636 | 0.739 | 0.129 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc11.weight + | -0.030 | -0.419 | 0.517 | 0.115 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.615 | 0.709 | 0.126 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc12.weight + | 0.002 | -0.230 | 0.457 | 0.087 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc12.bias + | 0.001 | -1.724 | 1.186 | 0.132 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.mlp.fc2.weight + | -0.019 | -1.909 | 0.255 | 0.190 | torch.Size([120]) || stage1.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.242 | 0.244 | 0.057 | torch.Size([120, 120]) || stage1.linear1.weight + | 0.004 | -0.221 | 0.224 | 0.083 | torch.Size([120]) || stage1.linear1.bias + | 0.737 | 0.334 | 1.046 | 0.119 | torch.Size([120]) || 
stage1.residual_group2.blocks.0.norm1.weight + | 0.013 | -0.911 | 0.763 | 0.193 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.bias + | -0.052 | -2.462 | 2.040 | 0.273 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.785 | 0.767 | 0.123 | torch.Size([360, 120]) || stage1.residual_group2.blocks.0.attn.qkv_self.weight + | 0.009 | -0.466 | 0.552 | 0.122 | torch.Size([360]) || stage1.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.431 | 0.475 | 0.091 | torch.Size([120, 120]) || stage1.residual_group2.blocks.0.attn.proj.weight + | -0.009 | -0.796 | 0.497 | 0.109 | torch.Size([120]) || stage1.residual_group2.blocks.0.attn.proj.bias + | 0.573 | 0.409 | 0.935 | 0.096 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.weight + | 0.015 | -0.828 | 0.839 | 0.175 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.bias + | 0.001 | -0.604 | 0.542 | 0.109 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc11.weight + | 0.037 | -0.179 | 0.273 | 0.076 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.666 | 0.553 | 0.116 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc12.weight + | -0.001 | -0.416 | 0.396 | 0.116 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc12.bias + | 0.001 | -0.654 | 0.538 | 0.118 | torch.Size([120, 240]) || stage1.residual_group2.blocks.0.mlp.fc2.weight + | -0.002 | -0.470 | 0.310 | 0.122 | torch.Size([120]) || stage1.residual_group2.blocks.0.mlp.fc2.bias + | 0.951 | 0.342 | 1.189 | 0.111 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.weight + | 0.010 | -0.697 | 0.802 | 0.166 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.bias + | -0.098 | -2.648 | 2.410 | 0.214 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.733 | 0.886 | 0.139 | torch.Size([360, 120]) || stage1.residual_group2.blocks.1.attn.qkv_self.weight + | -0.002 | -0.468 | 0.550 | 0.132 | torch.Size([360]) || stage1.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.435 | 0.377 | 0.096 | torch.Size([120, 120]) || stage1.residual_group2.blocks.1.attn.proj.weight + | -0.001 | -0.359 | 0.258 | 0.114 | torch.Size([120]) || stage1.residual_group2.blocks.1.attn.proj.bias + | 0.582 | 0.305 | 0.717 | 0.055 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.weight + | 0.008 | -0.714 | 0.833 | 0.131 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.bias + | 0.001 | -0.732 | 0.501 | 0.118 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc11.weight + | 0.004 | -0.306 | 0.267 | 0.091 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.510 | 0.533 | 0.126 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc12.weight + | -0.000 | -0.315 | 0.291 | 0.090 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc12.bias + | 0.000 | -0.736 | 0.789 | 0.126 | torch.Size([120, 240]) || stage1.residual_group2.blocks.1.mlp.fc2.weight + | -0.000 | -1.274 | 1.328 | 0.200 | torch.Size([120]) || stage1.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.390 | 0.303 | 0.069 | torch.Size([120, 120]) || stage1.linear2.weight + | 0.010 | 
-0.219 | 0.227 | 0.087 | torch.Size([120]) || stage1.linear2.bias + | -0.000 | -0.095 | 0.106 | 0.024 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.weight + | -0.001 | -0.036 | 0.036 | 0.013 | torch.Size([120]) || stage1.pa_deform.bias + | -0.000 | -0.136 | 0.141 | 0.017 | torch.Size([120, 242, 3, 3]) || stage1.pa_deform.conv_offset.0.weight + | -0.002 | -0.028 | 0.024 | 0.013 | torch.Size([120]) || stage1.pa_deform.conv_offset.0.bias + | -0.001 | -0.156 | 0.104 | 0.019 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.2.weight + | -0.008 | -0.055 | 0.045 | 0.022 | torch.Size([120]) || stage1.pa_deform.conv_offset.2.bias + | -0.001 | -0.098 | 0.106 | 0.018 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.4.weight + | -0.000 | -0.081 | 0.070 | 0.029 | torch.Size([120]) || stage1.pa_deform.conv_offset.4.bias + | -0.000 | -0.375 | 0.279 | 0.027 | torch.Size([324, 120, 3, 3]) || stage1.pa_deform.conv_offset.6.weight + | -0.003 | -0.074 | 0.070 | 0.028 | torch.Size([324]) || stage1.pa_deform.conv_offset.6.bias + | -0.000 | -0.776 | 0.733 | 0.114 | torch.Size([360, 360]) || stage1.pa_fuse.fc11.weight + | 0.021 | -0.239 | 0.513 | 0.121 | torch.Size([360]) || stage1.pa_fuse.fc11.bias + | 0.001 | -1.100 | 1.143 | 0.149 | torch.Size([360, 360]) || stage1.pa_fuse.fc12.weight + | 0.008 | -0.405 | 0.393 | 0.136 | torch.Size([360]) || stage1.pa_fuse.fc12.bias + | 0.000 | -0.963 | 0.899 | 0.142 | torch.Size([120, 360]) || stage1.pa_fuse.fc2.weight + | -0.055 | -0.616 | 0.599 | 0.197 | torch.Size([120]) || stage1.pa_fuse.fc2.bias + | 1.149 | 0.345 | 1.921 | 0.289 | torch.Size([480]) || stage2.reshape.1.weight + | 0.017 | -0.502 | 0.663 | 0.141 | torch.Size([480]) || stage2.reshape.1.bias + | -0.000 | -0.609 | 0.736 | 0.146 | torch.Size([120, 480]) || stage2.reshape.2.weight + | 0.006 | -0.136 | 0.404 | 0.077 | torch.Size([120]) || stage2.reshape.2.bias + | 0.686 | 0.172 | 1.113 | 0.175 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.weight + | -0.154 | -0.926 | 0.339 | 0.217 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.bias + | -0.120 | -1.869 | 4.616 | 0.310 | torch.Size([675, 6]) || stage2.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.514 | 0.499 | 0.102 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_self.weight + | -0.002 | -0.214 | 0.177 | 0.044 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_self.bias + | -0.001 | -0.499 | 0.529 | 0.093 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.attn.proj.weight + | -0.004 | -0.171 | 0.556 | 0.087 | torch.Size([120]) || stage2.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.642 | 0.598 | 0.083 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.141 | 0.125 | 0.027 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.592 | 0.325 | 0.794 | 0.096 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.weight + | 0.008 | -0.649 | 0.445 | 0.168 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.485 | 0.457 | 0.116 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc11.weight + | -0.053 | -0.240 | 0.171 | 0.062 | torch.Size([240]) || 
stage2.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.503 | 0.462 | 0.118 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc12.weight + | 0.005 | -0.177 | 0.268 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.690 | 0.498 | 0.123 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.mlp.fc2.weight + | -0.007 | -0.270 | 0.472 | 0.097 | torch.Size([120]) || stage2.residual_group1.blocks.0.mlp.fc2.bias + | 0.864 | 0.187 | 1.221 | 0.164 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.weight + | -0.146 | -1.128 | 0.299 | 0.204 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.bias + | -0.241 | -1.607 | 8.958 | 0.356 | torch.Size([675, 6]) || stage2.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.561 | 0.538 | 0.116 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_self.weight + | 0.001 | -0.198 | 0.222 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_self.bias + | 0.001 | -0.475 | 0.479 | 0.099 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.attn.proj.weight + | -0.006 | -0.295 | 0.341 | 0.101 | torch.Size([120]) || stage2.residual_group1.blocks.1.attn.proj.bias + | 0.001 | -0.961 | 0.789 | 0.080 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.001 | -0.105 | 0.143 | 0.024 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.653 | 0.401 | 0.810 | 0.063 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.weight + | 0.009 | -0.767 | 0.367 | 0.154 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.486 | 0.499 | 0.117 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc11.weight + | -0.056 | -0.185 | 0.147 | 0.058 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.548 | 0.121 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc12.weight + | 0.002 | -0.231 | 0.177 | 0.071 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc12.bias + | -0.001 | -0.578 | 0.609 | 0.123 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.mlp.fc2.weight + | -0.003 | -0.350 | 0.216 | 0.098 | torch.Size([120]) || stage2.residual_group1.blocks.1.mlp.fc2.bias + | 0.848 | 0.172 | 1.107 | 0.144 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.weight + | -0.168 | -1.123 | 0.330 | 0.178 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.bias + | -0.074 | -1.239 | 4.293 | 0.247 | torch.Size([675, 6]) || stage2.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.2.attn.position_bias + | -0.001 | -0.643 | 0.531 | 0.117 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_self.weight + | 0.003 | -0.220 | 0.376 | 0.047 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.529 | 0.479 | 0.100 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.230 | 0.295 | 0.074 | torch.Size([120]) || 
stage2.residual_group1.blocks.2.attn.proj.bias + | -0.001 | -0.726 | 0.768 | 0.091 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.001 | -0.167 | 0.193 | 0.028 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.695 | 0.334 | 0.833 | 0.068 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.weight + | 0.012 | -0.755 | 0.517 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.bias + | 0.001 | -0.474 | 0.480 | 0.119 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc11.weight + | -0.049 | -0.218 | 0.148 | 0.067 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.529 | 0.542 | 0.124 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc12.weight + | -0.006 | -0.245 | 0.239 | 0.073 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc12.bias + | -0.001 | -0.541 | 0.485 | 0.124 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.mlp.fc2.weight + | 0.000 | -0.318 | 0.170 | 0.077 | torch.Size([120]) || stage2.residual_group1.blocks.2.mlp.fc2.bias + | 0.903 | 0.178 | 1.124 | 0.124 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.weight + | -0.138 | -1.223 | 0.440 | 0.177 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.bias + | -0.164 | -1.383 | 5.910 | 0.305 | torch.Size([675, 6]) || stage2.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.526 | 0.496 | 0.120 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_self.weight + | 0.000 | -0.250 | 0.273 | 0.061 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.447 | 0.524 | 0.097 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.attn.proj.weight + | -0.003 | -0.243 | 0.256 | 0.082 | torch.Size([120]) || stage2.residual_group1.blocks.3.attn.proj.bias + | -0.001 | -0.551 | 0.730 | 0.083 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.001 | -0.145 | 0.126 | 0.024 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.707 | 0.319 | 0.855 | 0.063 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.weight + | 0.013 | -0.839 | 0.507 | 0.155 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.509 | 0.508 | 0.118 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc11.weight + | -0.051 | -0.219 | 0.155 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.475 | 0.592 | 0.124 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc12.weight + | -0.002 | -0.162 | 0.220 | 0.069 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.465 | 0.528 | 0.124 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.mlp.fc2.weight + | -0.002 | -0.243 | 0.286 | 0.088 | torch.Size([120]) || stage2.residual_group1.blocks.3.mlp.fc2.bias + | 0.948 | 0.220 | 1.175 | 0.108 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.weight + | -0.125 | -1.093 | 0.385 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.bias + | -0.150 | -1.632 | 4.522 | 0.341 | torch.Size([675, 6]) || stage2.residual_group1.blocks.4.attn.relative_position_bias_table + | 
337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.636 | 0.543 | 0.119 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_self.weight + | -0.001 | -0.254 | 0.262 | 0.048 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_self.bias + | 0.001 | -0.632 | 0.628 | 0.112 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.attn.proj.weight + | -0.005 | -0.240 | 0.330 | 0.104 | torch.Size([120]) || stage2.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.476 | 0.479 | 0.088 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.001 | -0.112 | 0.134 | 0.020 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.686 | 0.264 | 0.797 | 0.060 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.weight + | 0.012 | -0.889 | 0.427 | 0.140 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.bias + | 0.001 | -0.476 | 0.478 | 0.117 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc11.weight + | -0.051 | -0.267 | 0.180 | 0.071 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.506 | 0.517 | 0.127 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc12.weight + | 0.002 | -0.172 | 0.241 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc12.bias + | -0.001 | -0.570 | 0.542 | 0.126 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.mlp.fc2.weight + | -0.003 | -0.631 | 0.395 | 0.123 | torch.Size([120]) || stage2.residual_group1.blocks.4.mlp.fc2.bias + | 0.912 | 0.189 | 1.122 | 0.104 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.weight + | -0.114 | -1.125 | 0.188 | 0.140 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.bias + | -0.099 | -1.285 | 1.708 | 0.236 | torch.Size([675, 6]) || stage2.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.496 | 0.540 | 0.119 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_self.weight + | 0.003 | -0.260 | 0.228 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.511 | 0.454 | 0.095 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.attn.proj.weight + | 0.000 | -0.711 | 0.286 | 0.115 | torch.Size([120]) || stage2.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.444 | 0.454 | 0.082 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.000 | -0.101 | 0.133 | 0.021 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.668 | 0.312 | 0.800 | 0.056 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.weight + | 0.015 | -0.778 | 0.372 | 0.111 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.485 | 0.469 | 0.115 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc11.weight + | -0.045 | -0.294 | 0.173 | 0.083 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc11.bias + | 0.000 | -0.554 | 0.540 | 0.129 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc12.weight + | 0.001 | -0.183 
| 0.199 | 0.077 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.879 | 0.824 | 0.127 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.mlp.fc2.weight + | 0.001 | -1.670 | 0.358 | 0.208 | torch.Size([120]) || stage2.residual_group1.blocks.5.mlp.fc2.bias + | 0.001 | -0.253 | 0.346 | 0.068 | torch.Size([120, 120]) || stage2.linear1.weight + | 0.007 | -0.248 | 0.241 | 0.103 | torch.Size([120]) || stage2.linear1.bias + | 1.012 | 0.613 | 1.327 | 0.116 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.weight + | 0.019 | -0.724 | 0.685 | 0.244 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.bias + | 0.003 | -2.959 | 1.705 | 0.151 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.636 | 0.617 | 0.125 | torch.Size([360, 120]) || stage2.residual_group2.blocks.0.attn.qkv_self.weight + | -0.002 | -0.291 | 0.292 | 0.085 | torch.Size([360]) || stage2.residual_group2.blocks.0.attn.qkv_self.bias + | -0.002 | -0.476 | 0.512 | 0.138 | torch.Size([120, 120]) || stage2.residual_group2.blocks.0.attn.proj.weight + | -0.002 | -0.263 | 0.398 | 0.135 | torch.Size([120]) || stage2.residual_group2.blocks.0.attn.proj.bias + | 0.677 | 0.521 | 0.840 | 0.063 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.weight + | 0.010 | -0.710 | 0.541 | 0.173 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.bias + | 0.001 | -0.540 | 0.507 | 0.112 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc11.weight + | -0.016 | -0.242 | 0.201 | 0.077 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc11.bias + | 0.000 | -0.519 | 0.479 | 0.122 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc12.weight + | -0.006 | -0.162 | 0.231 | 0.071 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc12.bias + | -0.001 | -0.449 | 0.494 | 0.121 | torch.Size([120, 240]) || stage2.residual_group2.blocks.0.mlp.fc2.weight + | 0.002 | -0.293 | 0.222 | 0.095 | torch.Size([120]) || stage2.residual_group2.blocks.0.mlp.fc2.bias + | 1.053 | 0.832 | 1.269 | 0.079 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.weight + | 0.015 | -0.549 | 0.428 | 0.189 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.bias + | 0.007 | -3.099 | 1.550 | 0.170 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.673 | 0.604 | 0.131 | torch.Size([360, 120]) || stage2.residual_group2.blocks.1.attn.qkv_self.weight + | -0.001 | -0.416 | 0.391 | 0.089 | torch.Size([360]) || stage2.residual_group2.blocks.1.attn.qkv_self.bias + | -0.000 | -0.569 | 0.560 | 0.139 | torch.Size([120, 120]) || stage2.residual_group2.blocks.1.attn.proj.weight + | 0.004 | -0.613 | 0.428 | 0.158 | torch.Size([120]) || stage2.residual_group2.blocks.1.attn.proj.bias + | 0.762 | 0.464 | 0.954 | 0.085 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.weight + | 0.005 | -0.745 | 0.381 | 0.117 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.441 | 0.448 | 0.110 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc11.weight + | 0.019 | -0.292 | 0.460 | 0.117 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | 
-0.491 | 0.490 | 0.126 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc12.weight + | -0.007 | -0.285 | 0.177 | 0.068 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.535 | 0.631 | 0.125 | torch.Size([120, 240]) || stage2.residual_group2.blocks.1.mlp.fc2.weight + | -0.011 | -0.765 | 0.337 | 0.142 | torch.Size([120]) || stage2.residual_group2.blocks.1.mlp.fc2.bias + | 0.001 | -0.367 | 0.372 | 0.074 | torch.Size([120, 120]) || stage2.linear2.weight + | 0.009 | -0.288 | 0.342 | 0.130 | torch.Size([120]) || stage2.linear2.bias + | 0.000 | -0.112 | 0.093 | 0.022 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.weight + | -0.002 | -0.036 | 0.035 | 0.016 | torch.Size([120]) || stage2.pa_deform.bias + | 0.000 | -0.068 | 0.080 | 0.016 | torch.Size([120, 242, 3, 3]) || stage2.pa_deform.conv_offset.0.weight + | -0.009 | -0.035 | 0.023 | 0.013 | torch.Size([120]) || stage2.pa_deform.conv_offset.0.bias + | 0.000 | -0.068 | 0.079 | 0.019 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.2.weight + | -0.014 | -0.061 | 0.036 | 0.021 | torch.Size([120]) || stage2.pa_deform.conv_offset.2.bias + | -0.001 | -0.082 | 0.079 | 0.019 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.4.weight + | -0.003 | -0.075 | 0.069 | 0.035 | torch.Size([120]) || stage2.pa_deform.conv_offset.4.bias + | -0.000 | -0.166 | 0.139 | 0.016 | torch.Size([324, 120, 3, 3]) || stage2.pa_deform.conv_offset.6.weight + | -0.015 | -0.090 | 0.050 | 0.030 | torch.Size([324]) || stage2.pa_deform.conv_offset.6.bias + | -0.002 | -0.642 | 0.663 | 0.127 | torch.Size([360, 360]) || stage2.pa_fuse.fc11.weight + | 0.130 | -0.171 | 0.480 | 0.140 | torch.Size([360]) || stage2.pa_fuse.fc11.bias + | -0.000 | -0.696 | 0.620 | 0.118 | torch.Size([360, 360]) || stage2.pa_fuse.fc12.weight + | -0.007 | -0.337 | 0.301 | 0.102 | torch.Size([360]) || stage2.pa_fuse.fc12.bias + | 0.000 | -0.650 | 0.657 | 0.128 | torch.Size([120, 360]) || stage2.pa_fuse.fc2.weight + | 0.013 | -0.507 | 0.451 | 0.215 | torch.Size([120]) || stage2.pa_fuse.fc2.bias + | 1.067 | 0.372 | 1.778 | 0.269 | torch.Size([480]) || stage3.reshape.1.weight + | -0.004 | -0.699 | 0.521 | 0.227 | torch.Size([480]) || stage3.reshape.1.bias + | -0.000 | -0.643 | 0.743 | 0.138 | torch.Size([120, 480]) || stage3.reshape.2.weight + | 0.009 | -0.176 | 0.243 | 0.079 | torch.Size([120]) || stage3.reshape.2.bias + | 0.785 | 0.469 | 1.029 | 0.105 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.weight + | -0.102 | -0.716 | 0.311 | 0.179 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.bias + | -0.001 | -0.340 | 0.163 | 0.033 | torch.Size([675, 6]) || stage3.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.328 | 0.302 | 0.061 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_self.weight + | 0.004 | -0.232 | 0.189 | 0.063 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.343 | 0.346 | 0.058 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.attn.proj.weight + | 0.004 | -0.335 | 0.229 | 0.102 | torch.Size([120]) || stage3.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.366 | 0.325 | 0.052 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_mut.weight + 
| -0.001 | -0.091 | 0.074 | 0.017 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.751 | 0.517 | 0.928 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.weight + | 0.002 | -0.271 | 0.189 | 0.101 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.371 | 0.388 | 0.096 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc11.weight + | -0.073 | -0.203 | 0.039 | 0.046 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.400 | 0.401 | 0.094 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc12.weight + | -0.000 | -0.178 | 0.128 | 0.052 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc12.bias + | -0.001 | -0.410 | 0.429 | 0.098 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.mlp.fc2.weight + | 0.006 | -0.345 | 0.304 | 0.108 | torch.Size([120]) || stage3.residual_group1.blocks.0.mlp.fc2.bias + | 0.816 | 0.469 | 1.015 | 0.110 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.weight + | -0.103 | -0.647 | 0.225 | 0.140 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.bias + | 0.001 | -0.464 | 0.239 | 0.034 | torch.Size([675, 6]) || stage3.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.1.attn.position_bias + | -0.000 | -0.304 | 0.359 | 0.061 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_self.weight + | 0.001 | -0.173 | 0.193 | 0.047 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.299 | 0.408 | 0.055 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.attn.proj.weight + | 0.007 | -0.511 | 0.239 | 0.113 | torch.Size([120]) || stage3.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.288 | 0.254 | 0.049 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.001 | -0.060 | 0.054 | 0.016 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.796 | 0.609 | 0.971 | 0.076 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.weight + | -0.002 | -0.327 | 0.247 | 0.122 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.379 | 0.407 | 0.094 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc11.weight + | -0.077 | -0.214 | 0.034 | 0.045 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.391 | 0.432 | 0.092 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc12.weight + | 0.005 | -0.176 | 0.112 | 0.044 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc12.bias + | 0.000 | -0.378 | 0.399 | 0.093 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.mlp.fc2.weight + | 0.009 | -0.410 | 0.306 | 0.110 | torch.Size([120]) || stage3.residual_group1.blocks.1.mlp.fc2.bias + | 0.854 | 0.447 | 0.995 | 0.090 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.weight + | -0.086 | -0.513 | 0.198 | 0.116 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.bias + | -0.001 | -0.189 | 0.292 | 0.033 | torch.Size([675, 6]) || stage3.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | 
torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.390 | 0.367 | 0.067 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_self.weight + | -0.002 | -0.310 | 0.284 | 0.078 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.334 | 0.296 | 0.061 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.attn.proj.weight + | 0.004 | -0.356 | 0.299 | 0.096 | torch.Size([120]) || stage3.residual_group1.blocks.2.attn.proj.bias + | 0.000 | -0.276 | 0.315 | 0.055 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.000 | -0.094 | 0.066 | 0.014 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.829 | 0.673 | 1.017 | 0.074 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.weight + | 0.003 | -0.259 | 0.228 | 0.098 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.bias + | 0.001 | -0.410 | 0.385 | 0.091 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc11.weight + | -0.085 | -0.200 | 0.017 | 0.044 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.348 | 0.378 | 0.090 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc12.weight + | 0.001 | -0.130 | 0.105 | 0.042 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.346 | 0.425 | 0.090 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.mlp.fc2.weight + | 0.005 | -0.363 | 0.241 | 0.094 | torch.Size([120]) || stage3.residual_group1.blocks.2.mlp.fc2.bias + | 0.872 | 0.554 | 1.068 | 0.102 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.weight + | -0.057 | -0.402 | 0.133 | 0.087 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.bias + | 0.003 | -0.365 | 0.217 | 0.050 | torch.Size([675, 6]) || stage3.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.359 | 0.357 | 0.065 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_self.weight + | -0.002 | -0.265 | 0.294 | 0.062 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.300 | 0.271 | 0.054 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.attn.proj.weight + | 0.002 | -0.316 | 0.215 | 0.094 | torch.Size([120]) || stage3.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.370 | 0.329 | 0.039 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.000 | -0.056 | 0.066 | 0.013 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.842 | 0.631 | 0.989 | 0.073 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.weight + | -0.001 | -0.216 | 0.263 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.bias + | 0.001 | -0.388 | 0.391 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc11.weight + | -0.087 | -0.202 | 0.032 | 0.048 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.364 | 0.428 | 0.088 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc12.weight + | -0.000 | -0.137 | 0.106 | 0.043 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc12.bias + | -0.001 | -0.390 | 0.339 | 0.088 | torch.Size([120, 240]) || 
stage3.residual_group1.blocks.3.mlp.fc2.weight + | 0.003 | -0.376 | 0.203 | 0.090 | torch.Size([120]) || stage3.residual_group1.blocks.3.mlp.fc2.bias + | 0.913 | 0.498 | 1.102 | 0.096 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.weight + | -0.048 | -0.340 | 0.105 | 0.071 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.bias + | 0.001 | -0.706 | 0.306 | 0.058 | torch.Size([675, 6]) || stage3.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.4.attn.position_bias + | 0.000 | -0.373 | 0.339 | 0.076 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_self.weight + | -0.004 | -0.301 | 0.301 | 0.074 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.278 | 0.277 | 0.058 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.attn.proj.weight + | 0.003 | -0.310 | 0.240 | 0.079 | torch.Size([120]) || stage3.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.350 | 0.322 | 0.046 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.000 | -0.045 | 0.064 | 0.010 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.862 | 0.679 | 0.990 | 0.059 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.weight + | -0.004 | -0.313 | 0.190 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.bias + | 0.001 | -0.370 | 0.364 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc11.weight + | -0.092 | -0.231 | 0.129 | 0.057 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.375 | 0.511 | 0.090 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc12.weight + | 0.002 | -0.114 | 0.114 | 0.040 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -0.389 | 0.354 | 0.088 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.mlp.fc2.weight + | 0.005 | -0.258 | 0.164 | 0.073 | torch.Size([120]) || stage3.residual_group1.blocks.4.mlp.fc2.bias + | 0.899 | 0.480 | 1.089 | 0.103 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.weight + | -0.030 | -0.257 | 0.115 | 0.056 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.bias + | 0.003 | -0.462 | 0.290 | 0.069 | torch.Size([675, 6]) || stage3.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.391 | 0.365 | 0.069 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_self.weight + | -0.004 | -0.232 | 0.302 | 0.064 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.267 | 0.293 | 0.051 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.attn.proj.weight + | 0.000 | -0.250 | 0.182 | 0.070 | torch.Size([120]) || stage3.residual_group1.blocks.5.attn.proj.bias + | -0.000 | -0.238 | 0.257 | 0.033 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.001 | -0.032 | 0.033 | 0.008 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.864 | 0.651 | 1.029 | 0.070 | torch.Size([120]) || 
stage3.residual_group1.blocks.5.norm2.weight + | -0.003 | -0.212 | 0.175 | 0.075 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.bias + | 0.000 | -0.378 | 0.379 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc11.weight + | -0.097 | -0.308 | 0.026 | 0.051 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc11.bias + | 0.000 | -0.578 | 0.401 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc12.weight + | -0.005 | -0.166 | 0.131 | 0.049 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.358 | 0.376 | 0.085 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.mlp.fc2.weight + | 0.001 | -0.262 | 0.176 | 0.072 | torch.Size([120]) || stage3.residual_group1.blocks.5.mlp.fc2.bias + | 0.003 | -0.284 | 0.467 | 0.071 | torch.Size([120, 120]) || stage3.linear1.weight + | 0.006 | -0.201 | 0.269 | 0.090 | torch.Size([120]) || stage3.linear1.bias + | 0.877 | 0.568 | 1.197 | 0.115 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.weight + | 0.002 | -0.248 | 0.324 | 0.100 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.bias + | 0.000 | -0.261 | 0.125 | 0.029 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.563 | 0.552 | 0.074 | torch.Size([360, 120]) || stage3.residual_group2.blocks.0.attn.qkv_self.weight + | 0.005 | -0.257 | 0.302 | 0.081 | torch.Size([360]) || stage3.residual_group2.blocks.0.attn.qkv_self.bias + | 0.000 | -0.390 | 0.385 | 0.084 | torch.Size([120, 120]) || stage3.residual_group2.blocks.0.attn.proj.weight + | 0.002 | -0.450 | 0.235 | 0.125 | torch.Size([120]) || stage3.residual_group2.blocks.0.attn.proj.bias + | 0.986 | 0.755 | 1.165 | 0.078 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.weight + | -0.000 | -0.260 | 0.169 | 0.076 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.355 | 0.397 | 0.087 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc11.weight + | -0.046 | -0.220 | 0.086 | 0.055 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc11.bias + | 0.000 | -0.424 | 0.368 | 0.089 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc12.weight + | -0.006 | -0.111 | 0.122 | 0.038 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.354 | 0.374 | 0.090 | torch.Size([120, 240]) || stage3.residual_group2.blocks.0.mlp.fc2.weight + | 0.001 | -0.374 | 0.272 | 0.101 | torch.Size([120]) || stage3.residual_group2.blocks.0.mlp.fc2.bias + | 0.919 | 0.643 | 1.132 | 0.100 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.weight + | 0.000 | -0.177 | 0.181 | 0.063 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.bias + | 0.000 | -0.332 | 0.131 | 0.028 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.418 | 0.362 | 0.069 | torch.Size([360, 120]) || stage3.residual_group2.blocks.1.attn.qkv_self.weight + | -0.004 | -0.375 | 0.347 | 0.082 | torch.Size([360]) || stage3.residual_group2.blocks.1.attn.qkv_self.bias + | -0.001 | -0.294 | 0.354 | 0.077 | torch.Size([120, 120]) || stage3.residual_group2.blocks.1.attn.proj.weight + | 0.003 | -0.432 | 0.259 | 0.101 | 
torch.Size([120]) || stage3.residual_group2.blocks.1.attn.proj.bias + | 1.012 | 0.750 | 1.178 | 0.077 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.weight + | -0.001 | -0.171 | 0.155 | 0.060 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.331 | 0.356 | 0.087 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc11.weight + | -0.035 | -0.207 | 0.197 | 0.065 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.399 | 0.398 | 0.092 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc12.weight + | -0.002 | -0.111 | 0.129 | 0.041 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc12.bias + | -0.001 | -0.353 | 0.330 | 0.088 | torch.Size([120, 240]) || stage3.residual_group2.blocks.1.mlp.fc2.weight + | -0.001 | -0.328 | 0.127 | 0.064 | torch.Size([120]) || stage3.residual_group2.blocks.1.mlp.fc2.bias + | 0.003 | -0.289 | 0.519 | 0.073 | torch.Size([120, 120]) || stage3.linear2.weight + | 0.002 | -0.318 | 0.371 | 0.144 | torch.Size([120]) || stage3.linear2.bias + | -0.000 | -0.086 | 0.095 | 0.022 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.weight + | -0.002 | -0.023 | 0.021 | 0.010 | torch.Size([120]) || stage3.pa_deform.bias + | -0.000 | -0.060 | 0.056 | 0.015 | torch.Size([120, 242, 3, 3]) || stage3.pa_deform.conv_offset.0.weight + | -0.008 | -0.035 | 0.019 | 0.013 | torch.Size([120]) || stage3.pa_deform.conv_offset.0.bias + | -0.001 | -0.064 | 0.062 | 0.019 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.2.weight + | -0.007 | -0.044 | 0.031 | 0.019 | torch.Size([120]) || stage3.pa_deform.conv_offset.2.bias + | 0.000 | -0.062 | 0.063 | 0.019 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.4.weight + | -0.006 | -0.052 | 0.043 | 0.021 | torch.Size([120]) || stage3.pa_deform.conv_offset.4.bias + | 0.000 | -0.081 | 0.080 | 0.011 | torch.Size([324, 120, 3, 3]) || stage3.pa_deform.conv_offset.6.weight + | -0.004 | -0.087 | 0.083 | 0.021 | torch.Size([324]) || stage3.pa_deform.conv_offset.6.bias + | -0.002 | -0.465 | 0.513 | 0.101 | torch.Size([360, 360]) || stage3.pa_fuse.fc11.weight + | 0.059 | -0.251 | 0.595 | 0.104 | torch.Size([360]) || stage3.pa_fuse.fc11.bias + | -0.000 | -0.544 | 0.531 | 0.100 | torch.Size([360, 360]) || stage3.pa_fuse.fc12.weight + | 0.001 | -0.589 | 0.433 | 0.106 | torch.Size([360]) || stage3.pa_fuse.fc12.bias + | -0.000 | -0.535 | 0.562 | 0.127 | torch.Size([120, 360]) || stage3.pa_fuse.fc2.weight + | -0.001 | -0.401 | 0.342 | 0.121 | torch.Size([120]) || stage3.pa_fuse.fc2.bias + | 0.997 | 0.921 | 1.125 | 0.028 | torch.Size([480]) || stage4.reshape.1.weight + | -0.000 | -0.058 | 0.059 | 0.022 | torch.Size([480]) || stage4.reshape.1.bias + | 0.000 | -0.155 | 0.150 | 0.031 | torch.Size([120, 480]) || stage4.reshape.2.weight + | 0.001 | -0.016 | 0.016 | 0.006 | torch.Size([120]) || stage4.reshape.2.bias + | 1.002 | 0.999 | 1.009 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.weight + | 0.000 | -0.002 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.071 | 0.066 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.093 | 0.081 | 0.020 | torch.Size([360, 120]) 
|| stage4.residual_group1.blocks.0.attn.qkv_self.weight + | -0.000 | -0.009 | 0.009 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.080 | 0.097 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.attn.proj.weight + | 0.000 | -0.035 | 0.027 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.080 | 0.079 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.weight + | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.079 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc11.weight + | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.087 | 0.092 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc12.weight + | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc12.bias + | 0.000 | -0.080 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.mlp.fc2.weight + | 0.000 | -0.031 | 0.029 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.0.mlp.fc2.bias + | 1.002 | 0.997 | 1.007 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.weight + | -0.000 | -0.002 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.bias + | 0.000 | -0.066 | 0.065 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.1.attn.position_bias + | -0.000 | -0.078 | 0.081 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_self.weight + | 0.000 | -0.006 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.080 | 0.083 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.attn.proj.weight + | -0.000 | -0.027 | 0.029 | 0.012 | torch.Size([120]) || stage4.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.077 | 0.082 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.000 | -0.006 | 0.009 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.weight + | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.bias + | -0.000 | -0.080 | 0.078 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc11.weight + | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.077 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc12.weight + | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc12.bias + | 0.000 | -0.084 | 0.075 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.mlp.fc2.weight + | 0.000 | -0.034 | 0.031 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.1.mlp.fc2.bias + | 
1.002 | 0.996 | 1.008 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.weight + | -0.000 | -0.003 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.bias + | 0.001 | -0.070 | 0.071 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.091 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_self.weight + | -0.000 | -0.007 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.080 | 0.084 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.attn.proj.weight + | -0.000 | -0.023 | 0.026 | 0.010 | torch.Size([120]) || stage4.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.107 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.weight + | 0.000 | -0.000 | 0.001 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.076 | 0.077 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc11.weight + | -0.000 | -0.005 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -2.000 | 0.081 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc12.weight + | 0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc12.bias + | -0.000 | -0.084 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.mlp.fc2.weight + | 0.000 | -0.027 | 0.024 | 0.010 | torch.Size([120]) || stage4.residual_group1.blocks.2.mlp.fc2.bias + | 1.002 | 0.999 | 1.012 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.weight + | -0.000 | -0.003 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.bias + | 0.000 | -0.064 | 0.071 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.099 | 0.088 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_self.weight + | 0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.083 | 0.084 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.attn.proj.weight + | -0.000 | -0.019 | 0.018 | 0.008 | torch.Size([120]) || stage4.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.079 | 0.084 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.000 | -0.004 | 0.004 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.weight + | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.bias + | -0.000 | 
+ | -0.000 | -0.078 | 0.081 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc11.bias
+ | -0.000 | -0.087 | 0.076 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc12.bias
+ | -0.000 | -0.079 | 0.082 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.000 | -0.022 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group1.blocks.3.mlp.fc2.bias
+ | 1.002 | 0.998 | 1.011 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.weight
+ | -0.001 | -0.004 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.bias
+ | 0.000 | -0.089 | 0.081 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.080 | 0.085 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_self.bias
+ | -0.000 | -0.075 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.attn.proj.weight
+ | -0.000 | -0.021 | 0.016 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.4.attn.proj.bias
+ | 0.000 | -0.082 | 0.088 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_mut.weight
+ | -0.000 | -0.004 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.weight
+ | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.bias
+ | -0.000 | -0.086 | 0.080 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc11.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc11.bias
+ | 0.000 | -0.084 | 0.083 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc12.bias
+ | 0.000 | -0.076 | 0.081 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.000 | -0.018 | 0.015 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.4.mlp.fc2.bias
+ | 1.003 | 0.997 | 1.014 | 0.003 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.weight
+ | -0.001 | -0.005 | 0.004 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.bias
+ | -0.001 | -0.070 | 0.069 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -0.097 | 0.082 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_self.weight
+ | 0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_self.bias
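Every mlp.fc11/fc12/fc2 triple above is one gated (GEGLU-style) feed-forward: two parallel 120-to-240 projections, where the GELU-activated fc11 branch gates the linear fc12 branch elementwise, followed by a 240-to-120 output projection. That is why fc11 and fc12 weights are [240, 120] while fc2 is [120, 240]. A minimal sketch of that structure, matching the logged shapes (class name ours):

    import torch
    import torch.nn as nn

    class MlpGEGLU(nn.Module):
        # fc11/fc12: weight [240, 120]; fc2: weight [120, 240]
        def __init__(self, dim: int = 120, hidden: int = 240):
            super().__init__()
            self.fc11 = nn.Linear(dim, hidden)  # gate branch (GELU)
            self.fc12 = nn.Linear(dim, hidden)  # value branch (linear)
            self.act = nn.GELU()
            self.fc2 = nn.Linear(hidden, dim)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return self.fc2(self.act(self.fc11(x)) * self.fc12(x))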
+ | -0.000 | -0.075 | 0.089 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.attn.proj.weight
+ | 0.000 | -0.016 | 0.015 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.5.attn.proj.bias
+ | 0.000 | -0.083 | 0.091 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_mut.weight
+ | 0.000 | -0.006 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.bias
+ | 0.000 | -0.093 | 0.083 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc11.weight
+ | 0.000 | -0.002 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc11.bias
+ | 0.000 | -0.086 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc12.bias
+ | 0.000 | -0.079 | 0.092 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.mlp.fc2.weight
+ | -0.000 | -0.012 | 0.016 | 0.005 | torch.Size([120]) || stage4.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.090 | 0.111 | 0.024 | torch.Size([120, 120]) || stage4.linear1.weight
+ | 0.001 | -0.019 | 0.029 | 0.009 | torch.Size([120]) || stage4.linear1.bias
+ | 1.000 | 0.999 | 1.003 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.bias
+ | -0.000 | -0.078 | 0.075 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.0.attn.relative_position_index
+ | 0.000 | -0.084 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.000 | -0.005 | 0.004 | 0.001 | torch.Size([360]) || stage4.residual_group2.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.079 | 0.080 | 0.020 | torch.Size([120, 120]) || stage4.residual_group2.blocks.0.attn.proj.weight
+ | 0.000 | -0.021 | 0.024 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.0.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.bias
+ | -0.000 | -0.079 | 0.072 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.077 | 0.078 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.102 | 0.078 | 0.020 | torch.Size([120, 240]) || stage4.residual_group2.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.024 | 0.020 | 0.009 | torch.Size([120]) || stage4.residual_group2.blocks.0.mlp.fc2.bias
+ | 1.001 | 0.998 | 1.003 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.weight
+ | -0.000 | -0.002 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.bias
+ | -0.000 | -0.071 | 0.079 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.1.attn.relative_position_bias_table
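The two attention variants in each stage differ only in window size, and the logged shapes confirm it: a Swin-style relative-position bias table has (2Wt-1)(2Wh-1)(2Ww-1) rows with one column per head (6 here). For the residual_group1 blocks the window is (2, 8, 8), giving 3*15*15 = 675 rows and 2*8*8 = 128 tokens, hence a [128, 128] relative_position_index spanning 0..674 (the logged min 0.000, max 674.000 and mean 337.000 match exactly). For the residual_group2 blocks the window is (6, 8, 8): 11*15*15 = 2475 rows and 6*8*8 = 384 tokens, spanning 0..2474. A sketch of the standard index construction under those assumptions:

    import torch

    def rel_pos_table_rows_and_index(window):
        # window = (Wt, Wh, Ww); one bias-table row per 3-D relative offset
        wt, wh, ww = window
        coords = torch.stack(torch.meshgrid(
            torch.arange(wt), torch.arange(wh), torch.arange(ww),
            indexing='ij')).flatten(1)                  # (3, N), N = Wt*Wh*Ww
        rel = (coords[:, :, None] - coords[:, None, :]).permute(1, 2, 0)
        rel = rel + torch.tensor([wt - 1, wh - 1, ww - 1])  # shift to >= 0
        rel[..., 0] *= (2 * wh - 1) * (2 * ww - 1)      # flatten 3-D offset
        rel[..., 1] *= (2 * ww - 1)
        rows = (2 * wt - 1) * (2 * wh - 1) * (2 * ww - 1)
        return rows, rel.sum(-1)                        # rows, (N, N) index

    # (2, 8, 8) -> 675 rows, 128x128 index; (6, 8, 8) -> 2475 rows, 384x384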
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.1.attn.relative_position_index
+ | 0.000 | -0.078 | 0.096 | 0.020 | torch.Size([360, 120]) || stage4.residual_group2.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.005 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group2.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.077 | 0.080 | 0.020 | torch.Size([120, 120]) || stage4.residual_group2.blocks.1.attn.proj.weight
+ | 0.000 | -0.020 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.1.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.bias
+ | -0.000 | -0.085 | 0.082 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.083 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.000 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.078 | 0.078 | 0.020 | torch.Size([120, 240]) || stage4.residual_group2.blocks.1.mlp.fc2.weight
+ | 0.000 | -0.022 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.000 | -0.092 | 0.112 | 0.023 | torch.Size([120, 120]) || stage4.linear2.weight
+ | 0.000 | -0.032 | 0.049 | 0.015 | torch.Size([120]) || stage4.linear2.bias
+ | 0.000 | -0.036 | 0.037 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.weight
+ | 0.000 | -0.005 | 0.005 | 0.002 | torch.Size([120]) || stage4.pa_deform.bias
+ | -0.000 | -0.021 | 0.022 | 0.012 | torch.Size([120, 242, 3, 3]) || stage4.pa_deform.conv_offset.0.weight
+ | -0.001 | -0.021 | 0.021 | 0.012 | torch.Size([120]) || stage4.pa_deform.conv_offset.0.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.2.weight
+ | 0.002 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage4.pa_deform.conv_offset.2.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.4.weight
+ | -0.002 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage4.pa_deform.conv_offset.4.bias
+ | 0.000 | -0.003 | 0.002 | 0.000 | torch.Size([324, 120, 3, 3]) || stage4.pa_deform.conv_offset.6.weight
+ | 0.000 | -0.005 | 0.004 | 0.001 | torch.Size([324]) || stage4.pa_deform.conv_offset.6.bias
+ | 0.000 | -0.172 | 0.177 | 0.022 | torch.Size([360, 360]) || stage4.pa_fuse.fc11.weight
+ | 0.002 | -0.027 | 0.088 | 0.014 | torch.Size([360]) || stage4.pa_fuse.fc11.bias
+ | 0.000 | -0.212 | 0.163 | 0.022 | torch.Size([360, 360]) || stage4.pa_fuse.fc12.weight
+ | 0.000 | -0.066 | 0.081 | 0.014 | torch.Size([360]) || stage4.pa_fuse.fc12.bias
+ | 0.000 | -0.413 | 0.387 | 0.029 | torch.Size([120, 360]) || stage4.pa_fuse.fc2.weight
+ | -0.001 | -0.198 | 0.214 | 0.073 | torch.Size([120]) || stage4.pa_fuse.fc2.bias
+ | 0.979 | 0.896 | 1.076 | 0.053 | torch.Size([30]) || stage5.reshape.1.weight
+ | -0.005 | -0.074 | 0.100 | 0.043 | torch.Size([30]) || stage5.reshape.1.bias
+ | 0.000 | -0.240 | 0.249 | 0.058 | torch.Size([120, 30]) || stage5.reshape.2.weight
+ | -0.002 | -0.286 | 0.229 | 0.080 | torch.Size([120]) || stage5.reshape.2.bias
+ | 1.001 | 0.993 | 1.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.weight
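The stage4 alignment block above decomposes the same way in every stage. pa_deform.conv_offset.0 takes 242 = 2*120 + 2 input channels, consistent with concatenating the current feature, one flow-warped neighbour feature and the 2-channel optical flow; conv_offset.6 emits 324 = 3 * 12 * 9 channels for a 3x3 kernel with 12 deformable groups, i.e. two thirds sampling offsets (x and y) and one third modulation masks in the DCNv2 convention. pa_fuse then reduces 360 = 3*120 channels (the original feature plus the two aligned ones) back to 120 through the same gated-MLP shape. The stage5.reshape rows just above are the rescaling seam between stages: 120 channels are rearranged into 30 = 120/4 at a different resolution, normalized, then projected back to 120 (LayerNorm(30) followed by Linear(30, 120)). A sketch of the offset/mask bookkeeping, assuming the DCNv2 split:

    import torch

    groups, k = 12, 3                     # deformable groups, kernel size
    feat, flow_ch = 120, 2
    in_ch = 2 * feat + flow_ch            # 242, as in conv_offset.0
    out_ch = 3 * groups * k * k           # 324, as in conv_offset.6

    raw = torch.randn(1, out_ch, 64, 64)  # conv_offset.6 output on a 64x64 map
    o1, o2, mask = torch.chunk(raw, 3, dim=1)
    offset = torch.cat((o1, o2), dim=1)   # 216 = 2*12*9 sampling offsets
    mask = torch.sigmoid(mask)            # 108 = 12*9 per-location weights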
+ | -0.004 | -0.018 | 0.006 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.bias
+ | -0.000 | -0.066 | 0.062 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.0.attn.position_bias
+ | -0.000 | -0.091 | 0.086 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.000 | -0.014 | 0.012 | 0.004 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.166 | 0.172 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.attn.proj.weight
+ | -0.001 | -0.053 | 0.045 | 0.018 | torch.Size([120]) || stage5.residual_group1.blocks.0.attn.proj.bias
+ | -0.000 | -0.090 | 0.081 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_mut.weight
+ | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.999 | 0.987 | 1.001 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.weight
+ | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.bias
+ | 0.000 | -0.094 | 0.079 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc11.weight
+ | 0.000 | -0.022 | 0.012 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.082 | 0.083 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc12.weight
+ | 0.000 | -0.013 | 0.014 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.075 | 0.083 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.073 | 0.078 | 0.021 | torch.Size([120]) || stage5.residual_group1.blocks.0.mlp.fc2.bias
+ | 1.001 | 0.994 | 1.007 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.weight
+ | -0.004 | -0.016 | 0.004 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.bias
+ | 0.000 | -0.065 | 0.063 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.1.attn.position_bias
+ | -0.000 | -0.077 | 0.083 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.022 | 0.017 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.113 | 0.098 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.attn.proj.weight
+ | 0.000 | -0.058 | 0.045 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.080 | 0.080 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_mut.weight
+ | -0.000 | -0.008 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 0.999 | 0.982 | 1.001 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.weight
+ | 0.000 | -0.006 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.bias
+ | -0.000 | -0.076 | 0.083 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc11.weight
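Each residual_group1 block carries two packed projections: qkv_self ([360, 120], i.e. three 120-d maps for windowed self-attention) and qkv_mut (same shape, used for mutual attention between the two frames of the (2, 8, 8) window). Its attn.proj takes 240 inputs because the mutual- and self-attention outputs (120 each) are concatenated before projection; the residual_group2 blocks earlier, by contrast, have no qkv_mut and a [120, 120] proj, i.e. plain self-attention only. The attn.position_bias buffer ([1, 64, 120], range -1..1, identical statistics in every block) looks like a fixed sinusoidal encoding of the 8x8 = 64 spatial window positions. A single-head sketch of the self + mutual combination (head splitting and bias terms omitted):

    import torch
    import torch.nn as nn

    dim, n = 120, 64                      # 64 tokens per frame, 2 frames/window
    qkv_self = nn.Linear(dim, 3 * dim)    # weight [360, 120]
    qkv_mut = nn.Linear(dim, 3 * dim)     # weight [360, 120]
    proj = nn.Linear(2 * dim, dim)        # weight [120, 240]

    def attend(q, k, v):
        return ((q @ k.transpose(-2, -1)) / dim ** 0.5).softmax(-1) @ v

    x = torch.randn(1, 2 * n, dim)        # one (2, 8, 8) window = 128 tokens
    q, k, v = qkv_self(x).chunk(3, -1)
    x_self = attend(q, k, v)              # joint self-attention, 128 tokens

    qm, km, vm = qkv_mut(x).chunk(3, -1)
    x_mut = torch.cat([attend(qm[:, :n], km[:, n:], vm[:, n:]),   # frame 1 <- 2
                       attend(qm[:, n:], km[:, :n], vm[:, :n])],  # frame 2 <- 1
                      dim=1)
    y = proj(torch.cat([x_mut, x_self], dim=-1))  # 240 -> 120, hence [120, 240]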
+ | 0.000 | -0.017 | 0.014 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.080 | 0.086 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc12.weight
+ | -0.000 | -0.014 | 0.016 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.096 | 0.079 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.001 | -0.051 | 0.039 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.1.mlp.fc2.bias
+ | 1.002 | 0.998 | 1.009 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.weight
+ | -0.004 | -0.014 | 0.003 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.bias
+ | 0.000 | -0.067 | 0.073 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.2.attn.position_bias
+ | 0.000 | -0.085 | 0.087 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_self.weight
+ | 0.000 | -0.015 | 0.014 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_self.bias
+ | -0.000 | -0.108 | 0.095 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.attn.proj.weight
+ | -0.001 | -0.043 | 0.039 | 0.013 | torch.Size([120]) || stage5.residual_group1.blocks.2.attn.proj.bias
+ | -0.000 | -0.088 | 0.081 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_mut.weight
+ | -0.000 | -0.009 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 0.999 | 0.978 | 1.001 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.weight
+ | 0.000 | -0.003 | 0.004 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.bias
+ | -0.000 | -0.076 | 0.081 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.000 | -0.012 | 0.019 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc11.bias
+ | 0.000 | -0.079 | 0.077 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc12.weight
+ | -0.001 | -0.014 | 0.012 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc12.bias
+ | 0.000 | -0.076 | 0.082 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.mlp.fc2.weight
+ | -0.000 | -0.047 | 0.043 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.2.mlp.fc2.bias
+ | 1.002 | 0.978 | 1.015 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.weight
+ | -0.004 | -0.013 | 0.004 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.bias
+ | -0.000 | -0.084 | 0.070 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.078 | 0.082 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_self.weight
+ | -0.000 | -0.014 | 0.014 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_self.bias
+ | -0.000 | -0.123 | 0.132 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.attn.proj.weight
+ | 0.001 | -0.028 | 0.044 | 0.015 | torch.Size([120]) || 
stage5.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.082 | 0.089 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.999 | 0.974 | 1.001 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.weight + | 0.000 | -0.008 | 0.010 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.075 | 0.088 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc11.weight + | 0.000 | -0.014 | 0.019 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.081 | 0.080 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc12.weight + | 0.000 | -0.031 | 0.020 | 0.006 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.081 | 0.106 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.mlp.fc2.weight + | -0.002 | -0.046 | 0.042 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.3.mlp.fc2.bias + | 1.003 | 0.944 | 1.017 | 0.009 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.weight + | -0.005 | -0.015 | 0.004 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.071 | 0.067 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.085 | 0.090 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_self.weight + | -0.000 | -0.021 | 0.013 | 0.004 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.130 | 0.089 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.attn.proj.weight + | -0.001 | -0.036 | 0.024 | 0.011 | torch.Size([120]) || stage5.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.086 | 0.076 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.000 | -0.008 | 0.008 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.999 | 0.967 | 1.001 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.weight + | 0.000 | -0.006 | 0.007 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.bias + | 0.000 | -0.080 | 0.085 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc11.weight + | -0.001 | -0.015 | 0.010 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.081 | 0.077 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc12.weight + | -0.000 | -0.020 | 0.018 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.081 | 0.085 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.mlp.fc2.weight + | -0.001 | -0.037 | 0.050 | 0.014 | torch.Size([120]) || stage5.residual_group1.blocks.4.mlp.fc2.bias + | 1.004 | 0.976 | 1.039 | 0.008 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.weight + | -0.005 | -0.015 | 0.005 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.bias + | -0.000 | -0.070 | 0.076 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.5.attn.relative_position_bias_table + | 
337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.099 | 0.097 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_self.weight + | -0.000 | -0.011 | 0.012 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.084 | 0.093 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.attn.proj.weight + | 0.000 | -0.038 | 0.035 | 0.012 | torch.Size([120]) || stage5.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.087 | 0.082 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.000 | -0.008 | 0.010 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.998 | 0.960 | 1.002 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.weight + | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.088 | 0.095 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc11.weight + | -0.000 | -0.014 | 0.027 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.081 | 0.074 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc12.weight + | 0.000 | -0.013 | 0.025 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -0.100 | 0.086 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.mlp.fc2.weight + | 0.000 | -0.022 | 0.030 | 0.011 | torch.Size([120]) || stage5.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.102 | 0.117 | 0.023 | torch.Size([120, 120]) || stage5.linear1.weight + | -0.003 | -0.297 | 0.242 | 0.084 | torch.Size([120]) || stage5.linear1.bias + | 0.999 | 0.971 | 1.008 | 0.005 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.weight + | -0.000 | -0.035 | 0.034 | 0.011 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.bias + | 0.000 | -0.079 | 0.074 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.087 | 0.083 | 0.020 | torch.Size([360, 120]) || stage5.residual_group2.blocks.0.attn.qkv_self.weight + | -0.000 | -0.028 | 0.018 | 0.005 | torch.Size([360]) || stage5.residual_group2.blocks.0.attn.qkv_self.bias + | 0.000 | -0.079 | 0.082 | 0.021 | torch.Size([120, 120]) || stage5.residual_group2.blocks.0.attn.proj.weight + | -0.001 | -0.146 | 0.171 | 0.054 | torch.Size([120]) || stage5.residual_group2.blocks.0.attn.proj.bias + | 0.997 | 0.967 | 1.003 | 0.006 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.weight + | 0.000 | -0.005 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.bias + | -0.000 | -0.073 | 0.089 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc11.weight + | -0.002 | -0.017 | 0.008 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.084 | 0.073 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc12.weight + | 0.000 | -0.013 | 0.011 | 0.003 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.083 | 0.085 | 0.020 | torch.Size([120, 240]) || 
stage5.residual_group2.blocks.0.mlp.fc2.weight + | 0.000 | -0.103 | 0.140 | 0.037 | torch.Size([120]) || stage5.residual_group2.blocks.0.mlp.fc2.bias + | 0.999 | 0.986 | 1.010 | 0.004 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.weight + | 0.000 | -0.035 | 0.034 | 0.010 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.bias + | 0.000 | -0.087 | 0.074 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.084 | 0.079 | 0.020 | torch.Size([360, 120]) || stage5.residual_group2.blocks.1.attn.qkv_self.weight + | 0.000 | -0.024 | 0.024 | 0.005 | torch.Size([360]) || stage5.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.077 | 0.078 | 0.021 | torch.Size([120, 120]) || stage5.residual_group2.blocks.1.attn.proj.weight + | -0.001 | -0.112 | 0.144 | 0.038 | torch.Size([120]) || stage5.residual_group2.blocks.1.attn.proj.bias + | 0.998 | 0.965 | 1.004 | 0.006 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.weight + | 0.000 | -0.004 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.088 | 0.079 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc11.weight + | -0.001 | -0.012 | 0.015 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.102 | 0.080 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc12.weight + | 0.000 | -0.012 | 0.009 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc12.bias + | 0.000 | -0.075 | 0.078 | 0.020 | torch.Size([120, 240]) || stage5.residual_group2.blocks.1.mlp.fc2.weight + | 0.000 | -0.105 | 0.131 | 0.042 | torch.Size([120]) || stage5.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.220 | 0.209 | 0.035 | torch.Size([120, 120]) || stage5.linear2.weight + | -0.003 | -0.335 | 0.284 | 0.096 | torch.Size([120]) || stage5.linear2.bias + | -0.000 | -0.064 | 0.065 | 0.019 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.weight + | 0.001 | -0.050 | 0.050 | 0.029 | torch.Size([120]) || stage5.pa_deform.bias + | 0.000 | -0.119 | 0.106 | 0.013 | torch.Size([120, 242, 3, 3]) || stage5.pa_deform.conv_offset.0.weight + | -0.006 | -0.030 | 0.026 | 0.014 | torch.Size([120]) || stage5.pa_deform.conv_offset.0.bias + | -0.001 | -0.055 | 0.050 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.2.weight + | 0.001 | -0.033 | 0.031 | 0.018 | torch.Size([120]) || stage5.pa_deform.conv_offset.2.bias + | 0.001 | -0.060 | 0.050 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.4.weight + | -0.005 | -0.040 | 0.037 | 0.019 | torch.Size([120]) || stage5.pa_deform.conv_offset.4.bias + | 0.001 | -0.038 | 0.051 | 0.006 | torch.Size([324, 120, 3, 3]) || stage5.pa_deform.conv_offset.6.weight + | 0.000 | -0.048 | 0.050 | 0.017 | torch.Size([324]) || stage5.pa_deform.conv_offset.6.bias + | 0.000 | -0.334 | 0.340 | 0.036 | torch.Size([360, 360]) || stage5.pa_fuse.fc11.weight + | 0.037 | -0.050 | 0.294 | 0.064 | torch.Size([360]) || stage5.pa_fuse.fc11.bias + | -0.000 | -0.343 | 0.349 | 0.036 | torch.Size([360, 360]) || stage5.pa_fuse.fc12.weight + | -0.001 | -0.237 | 0.244 | 0.049 | torch.Size([360]) || stage5.pa_fuse.fc12.bias + | -0.000 | -0.575 | 0.591 | 0.060 | torch.Size([120, 360]) || stage5.pa_fuse.fc2.weight + | -0.001 | -0.404 | 0.344 | 0.122 | torch.Size([120]) || 
stage5.pa_fuse.fc2.bias + | 1.254 | 1.058 | 1.466 | 0.126 | torch.Size([30]) || stage6.reshape.1.weight + | -0.001 | -0.074 | 0.093 | 0.041 | torch.Size([30]) || stage6.reshape.1.bias + | 0.000 | -0.734 | 0.625 | 0.177 | torch.Size([120, 30]) || stage6.reshape.2.weight + | 0.003 | -0.269 | 0.341 | 0.108 | torch.Size([120]) || stage6.reshape.2.bias + | 0.815 | 0.495 | 1.118 | 0.121 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.weight + | -0.071 | -0.291 | 0.263 | 0.101 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.080 | 0.087 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.136 | 0.134 | 0.026 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_self.weight + | -0.000 | -0.061 | 0.037 | 0.014 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_self.bias + | -0.000 | -0.201 | 0.182 | 0.032 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.attn.proj.weight + | 0.000 | -0.223 | 0.189 | 0.090 | torch.Size([120]) || stage6.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.184 | 0.211 | 0.029 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.000 | -0.049 | 0.069 | 0.011 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.710 | 0.556 | 0.893 | 0.072 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.weight + | -0.003 | -0.172 | 0.193 | 0.070 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.217 | 0.211 | 0.033 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc11.weight + | -0.041 | -0.158 | 0.025 | 0.036 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.209 | 0.178 | 0.031 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc12.weight + | -0.000 | -0.141 | 0.186 | 0.031 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc12.bias + | 0.000 | -0.245 | 0.347 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.mlp.fc2.weight + | 0.005 | -0.161 | 0.188 | 0.079 | torch.Size([120]) || stage6.residual_group1.blocks.0.mlp.fc2.bias + | 0.780 | 0.582 | 0.963 | 0.088 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.weight + | -0.112 | -0.302 | 0.103 | 0.085 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.bias + | 0.000 | -0.101 | 0.072 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.112 | 0.178 | 0.026 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_self.weight + | -0.000 | -0.034 | 0.049 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.223 | 0.242 | 0.033 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.attn.proj.weight + | -0.003 | -0.149 | 0.105 | 0.047 | torch.Size([120]) || stage6.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.199 | 0.173 | 0.031 | torch.Size([360, 120]) || 
stage6.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.000 | -0.035 | 0.056 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.744 | 0.530 | 0.917 | 0.066 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.weight + | 0.004 | -0.131 | 0.180 | 0.059 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.243 | 0.294 | 0.036 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc11.weight + | -0.039 | -0.217 | 0.045 | 0.037 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.206 | 0.178 | 0.033 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc12.weight + | -0.000 | -0.129 | 0.125 | 0.028 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.236 | 0.276 | 0.040 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.mlp.fc2.weight + | 0.000 | -0.158 | 0.170 | 0.063 | torch.Size([120]) || stage6.residual_group1.blocks.1.mlp.fc2.bias + | 0.829 | 0.586 | 1.007 | 0.078 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.weight + | -0.101 | -0.353 | 0.132 | 0.092 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.bias + | -0.000 | -0.082 | 0.076 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.154 | 0.143 | 0.032 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_self.weight + | 0.000 | -0.041 | 0.038 | 0.012 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.187 | 0.202 | 0.035 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.096 | 0.127 | 0.041 | torch.Size([120]) || stage6.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.203 | 0.185 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.000 | -0.045 | 0.049 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.768 | 0.491 | 0.904 | 0.069 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.weight + | 0.001 | -0.146 | 0.159 | 0.062 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.184 | 0.204 | 0.037 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc11.weight + | -0.043 | -0.185 | 0.020 | 0.035 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.188 | 0.270 | 0.035 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc12.weight + | 0.000 | -0.152 | 0.134 | 0.031 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc12.bias + | -0.000 | -0.222 | 0.217 | 0.042 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.mlp.fc2.weight + | 0.002 | -0.141 | 0.144 | 0.058 | torch.Size([120]) || stage6.residual_group1.blocks.2.mlp.fc2.bias + | 0.820 | 0.554 | 0.976 | 0.065 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.weight + | -0.091 | -0.336 | 0.137 | 0.087 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.bias + | 0.000 | -0.124 | 0.222 | 0.023 | torch.Size([675, 6]) || stage6.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || 
stage6.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.157 | 0.175 | 0.036 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_self.weight + | -0.001 | -0.049 | 0.049 | 0.014 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.238 | 0.236 | 0.036 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.attn.proj.weight + | -0.003 | -0.077 | 0.074 | 0.031 | torch.Size([120]) || stage6.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.212 | 0.265 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.000 | -0.028 | 0.052 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.768 | 0.530 | 0.903 | 0.080 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.weight + | 0.002 | -0.104 | 0.157 | 0.044 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.bias + | -0.000 | -0.197 | 0.220 | 0.039 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc11.weight + | -0.042 | -0.155 | 0.043 | 0.039 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.166 | 0.199 | 0.036 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc12.weight + | 0.001 | -0.102 | 0.138 | 0.040 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.241 | 0.256 | 0.044 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.mlp.fc2.weight + | 0.003 | -0.123 | 0.115 | 0.046 | torch.Size([120]) || stage6.residual_group1.blocks.3.mlp.fc2.bias + | 0.817 | 0.631 | 0.918 | 0.055 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.weight + | -0.082 | -0.295 | 0.141 | 0.074 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.084 | 0.205 | 0.024 | torch.Size([675, 6]) || stage6.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.174 | 0.199 | 0.040 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_self.weight + | -0.000 | -0.060 | 0.081 | 0.017 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_self.bias + | -0.000 | -0.194 | 0.191 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.attn.proj.weight + | 0.001 | -0.083 | 0.077 | 0.035 | torch.Size([120]) || stage6.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.218 | 0.243 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.000 | -0.031 | 0.024 | 0.007 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.744 | 0.478 | 0.913 | 0.082 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.weight + | -0.003 | -0.146 | 0.110 | 0.053 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.223 | 0.238 | 0.042 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc11.weight + | -0.046 | -0.200 | 0.071 | 0.051 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.168 | 0.201 | 0.039 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc12.weight + | 0.002 | -0.128 | 0.141 | 0.053 | torch.Size([240]) || 
stage6.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -0.220 | 0.205 | 0.047 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.mlp.fc2.weight + | 0.001 | -0.086 | 0.094 | 0.034 | torch.Size([120]) || stage6.residual_group1.blocks.4.mlp.fc2.bias + | 0.754 | 0.353 | 0.933 | 0.056 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.weight + | -0.058 | -0.246 | 0.105 | 0.060 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.bias + | -0.000 | -0.113 | 0.536 | 0.030 | torch.Size([675, 6]) || stage6.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.261 | 0.224 | 0.044 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_self.weight + | 0.002 | -0.050 | 0.067 | 0.018 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_self.bias + | 0.000 | -0.234 | 0.256 | 0.038 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.attn.proj.weight + | 0.002 | -0.079 | 0.076 | 0.036 | torch.Size([120]) || stage6.residual_group1.blocks.5.attn.proj.bias + | -0.000 | -0.211 | 0.231 | 0.029 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.000 | -0.033 | 0.030 | 0.008 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.677 | 0.275 | 0.833 | 0.083 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.weight + | 0.001 | -0.224 | 0.306 | 0.102 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.196 | 0.211 | 0.045 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc11.weight + | -0.061 | -0.289 | 0.136 | 0.089 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.271 | 0.312 | 0.048 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc12.weight + | 0.003 | -0.166 | 0.155 | 0.075 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.286 | 0.375 | 0.054 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.mlp.fc2.weight + | 0.005 | -0.054 | 0.137 | 0.031 | torch.Size([120]) || stage6.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.174 | 0.172 | 0.039 | torch.Size([120, 120]) || stage6.linear1.weight + | 0.002 | -0.275 | 0.348 | 0.113 | torch.Size([120]) || stage6.linear1.bias + | 0.704 | 0.402 | 1.002 | 0.132 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.weight + | 0.001 | -0.466 | 0.407 | 0.157 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.bias + | -0.000 | -0.172 | 0.570 | 0.025 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.337 | 0.378 | 0.041 | torch.Size([360, 120]) || stage6.residual_group2.blocks.0.attn.qkv_self.weight + | -0.000 | -0.071 | 0.068 | 0.019 | torch.Size([360]) || stage6.residual_group2.blocks.0.attn.qkv_self.bias + | 0.001 | -0.290 | 0.321 | 0.055 | torch.Size([120, 120]) || stage6.residual_group2.blocks.0.attn.proj.weight + | 0.001 | -0.255 | 0.250 | 0.104 | torch.Size([120]) || stage6.residual_group2.blocks.0.attn.proj.bias + | 0.695 | 0.353 | 0.966 | 0.098 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.weight + | -0.001 | -0.218 | 0.165 | 
0.080 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.259 | 0.255 | 0.039 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc11.weight + | -0.044 | -0.256 | 0.042 | 0.047 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.234 | 0.214 | 0.035 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc12.weight + | 0.002 | -0.133 | 0.091 | 0.027 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.333 | 0.296 | 0.042 | torch.Size([120, 240]) || stage6.residual_group2.blocks.0.mlp.fc2.weight + | 0.003 | -0.238 | 0.280 | 0.092 | torch.Size([120]) || stage6.residual_group2.blocks.0.mlp.fc2.bias + | 0.671 | 0.425 | 0.980 | 0.094 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.weight + | 0.001 | -0.261 | 0.305 | 0.119 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.372 | 0.942 | 0.031 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.450 | 0.494 | 0.045 | torch.Size([360, 120]) || stage6.residual_group2.blocks.1.attn.qkv_self.weight + | 0.000 | -0.133 | 0.119 | 0.029 | torch.Size([360]) || stage6.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.239 | 0.288 | 0.046 | torch.Size([120, 120]) || stage6.residual_group2.blocks.1.attn.proj.weight + | -0.001 | -0.187 | 0.157 | 0.064 | torch.Size([120]) || stage6.residual_group2.blocks.1.attn.proj.bias + | 0.687 | 0.160 | 0.907 | 0.128 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.weight + | -0.002 | -0.192 | 0.222 | 0.084 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.257 | 0.426 | 0.042 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc11.weight + | -0.064 | -0.207 | 0.036 | 0.048 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.269 | 0.224 | 0.038 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc12.weight + | -0.000 | -0.126 | 0.129 | 0.030 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.308 | 0.298 | 0.041 | torch.Size([120, 240]) || stage6.residual_group2.blocks.1.mlp.fc2.weight + | -0.004 | -0.180 | 0.192 | 0.061 | torch.Size([120]) || stage6.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.297 | 0.368 | 0.069 | torch.Size([120, 120]) || stage6.linear2.weight + | 0.001 | -0.431 | 0.480 | 0.189 | torch.Size([120]) || stage6.linear2.bias + | 0.000 | -0.100 | 0.104 | 0.023 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.weight + | 0.001 | -0.018 | 0.029 | 0.010 | torch.Size([120]) || stage6.pa_deform.bias + | 0.000 | -0.105 | 0.111 | 0.015 | torch.Size([120, 242, 3, 3]) || stage6.pa_deform.conv_offset.0.weight + | -0.007 | -0.033 | 0.024 | 0.014 | torch.Size([120]) || stage6.pa_deform.conv_offset.0.bias + | -0.001 | -0.071 | 0.067 | 0.019 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.2.weight + | -0.003 | -0.061 | 0.043 | 0.022 | torch.Size([120]) || stage6.pa_deform.conv_offset.2.bias + | -0.000 | -0.074 | 0.068 | 0.019 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.4.weight + | 0.001 | -0.075 | 0.056 | 0.030 | torch.Size([120]) || stage6.pa_deform.conv_offset.4.bias + | 0.001 | -0.124 | 0.108 | 0.013 | torch.Size([324, 120, 3, 3]) || stage6.pa_deform.conv_offset.6.weight + | -0.001 | -0.113 | 0.076 | 
0.021 | torch.Size([324]) || stage6.pa_deform.conv_offset.6.bias + | -0.001 | -0.517 | 0.524 | 0.101 | torch.Size([360, 360]) || stage6.pa_fuse.fc11.weight + | 0.154 | -0.305 | 0.679 | 0.180 | torch.Size([360]) || stage6.pa_fuse.fc11.bias + | 0.000 | -0.680 | 0.728 | 0.103 | torch.Size([360, 360]) || stage6.pa_fuse.fc12.weight + | 0.020 | -0.514 | 0.417 | 0.199 | torch.Size([360]) || stage6.pa_fuse.fc12.bias + | -0.000 | -0.587 | 0.737 | 0.135 | torch.Size([120, 360]) || stage6.pa_fuse.fc2.weight + | 0.015 | -0.437 | 0.490 | 0.230 | torch.Size([120]) || stage6.pa_fuse.fc2.bias + | 1.284 | 1.119 | 1.404 | 0.055 | torch.Size([30]) || stage7.reshape.1.weight + | -0.014 | -0.286 | 0.184 | 0.122 | torch.Size([30]) || stage7.reshape.1.bias + | -0.000 | -0.521 | 0.576 | 0.154 | torch.Size([120, 30]) || stage7.reshape.2.weight + | 0.004 | -0.387 | 0.738 | 0.175 | torch.Size([120]) || stage7.reshape.2.bias + | 0.440 | 0.099 | 0.775 | 0.141 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.weight + | -0.177 | -0.670 | 0.319 | 0.183 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.bias + | -0.055 | -2.159 | 1.979 | 0.240 | torch.Size([675, 6]) || stage7.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.535 | 0.554 | 0.104 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_self.weight + | 0.003 | -0.193 | 0.281 | 0.053 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_self.bias + | -0.001 | -0.397 | 0.395 | 0.075 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.attn.proj.weight + | -0.001 | -0.232 | 0.692 | 0.106 | torch.Size([120]) || stage7.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.899 | 1.073 | 0.091 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.122 | 0.104 | 0.017 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.310 | 0.157 | 0.440 | 0.055 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.weight + | 0.006 | -0.474 | 0.266 | 0.105 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.605 | 0.490 | 0.115 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc11.weight + | -0.101 | -0.310 | 0.126 | 0.070 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.448 | 0.475 | 0.116 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc12.weight + | 0.006 | -0.185 | 0.215 | 0.071 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc12.bias + | 0.001 | -0.465 | 0.512 | 0.122 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.mlp.fc2.weight + | 0.000 | -0.150 | 0.417 | 0.077 | torch.Size([120]) || stage7.residual_group1.blocks.0.mlp.fc2.bias + | 0.577 | 0.165 | 0.829 | 0.105 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.weight + | -0.136 | -0.849 | 0.206 | 0.141 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.bias + | -0.143 | -3.020 | 4.621 | 0.357 | torch.Size([675, 6]) || stage7.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || 
stage7.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.647 | 0.640 | 0.123 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_self.weight + | -0.002 | -0.356 | 0.382 | 0.064 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.457 | 0.378 | 0.081 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.attn.proj.weight + | 0.000 | -0.250 | 0.707 | 0.108 | torch.Size([120]) || stage7.residual_group1.blocks.1.attn.proj.bias + | -0.001 | -1.055 | 1.091 | 0.096 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.001 | -0.093 | 0.123 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.411 | 0.265 | 0.535 | 0.044 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.weight + | 0.008 | -0.630 | 0.264 | 0.121 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.501 | 0.506 | 0.119 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc11.weight + | -0.087 | -0.341 | 0.140 | 0.073 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.450 | 0.527 | 0.119 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc12.weight + | 0.005 | -0.188 | 0.171 | 0.063 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.554 | 0.546 | 0.121 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.mlp.fc2.weight + | -0.000 | -0.135 | 0.220 | 0.061 | torch.Size([120]) || stage7.residual_group1.blocks.1.mlp.fc2.bias + | 0.655 | 0.134 | 0.896 | 0.130 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.weight + | -0.139 | -0.788 | 0.181 | 0.115 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.bias + | -0.062 | -3.469 | 3.276 | 0.272 | torch.Size([675, 6]) || stage7.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.2.attn.position_bias + | -0.000 | -0.592 | 0.650 | 0.124 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_self.weight + | -0.000 | -0.308 | 0.218 | 0.062 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.355 | 0.345 | 0.082 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.213 | 0.700 | 0.097 | torch.Size([120]) || stage7.residual_group1.blocks.2.attn.proj.bias + | -0.001 | -1.166 | 0.942 | 0.107 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.000 | -0.106 | 0.093 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.466 | 0.317 | 0.565 | 0.042 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.weight + | 0.014 | -0.657 | 0.280 | 0.118 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.541 | 0.494 | 0.118 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc11.weight + | -0.079 | -0.335 | 0.122 | 0.080 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.513 | 0.493 | 0.123 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc12.weight + | -0.007 | -0.180 | 0.175 | 0.066 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc12.bias + | -0.001 | -0.509 | 0.479 | 0.123 | torch.Size([120, 240]) || 
stage7.residual_group1.blocks.2.mlp.fc2.weight + | 0.004 | -0.093 | 0.293 | 0.054 | torch.Size([120]) || stage7.residual_group1.blocks.2.mlp.fc2.bias + | 0.693 | 0.147 | 0.945 | 0.133 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.weight + | -0.132 | -0.906 | 0.249 | 0.113 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.bias + | -0.108 | -3.576 | 4.241 | 0.344 | torch.Size([675, 6]) || stage7.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.945 | 1.095 | 0.129 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_self.weight + | 0.003 | -0.274 | 0.204 | 0.061 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_self.bias + | -0.001 | -0.379 | 0.351 | 0.081 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.attn.proj.weight + | 0.000 | -0.211 | 0.587 | 0.095 | torch.Size([120]) || stage7.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -1.269 | 1.067 | 0.102 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.001 | -0.091 | 0.117 | 0.021 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.499 | 0.285 | 0.570 | 0.040 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.weight + | 0.012 | -0.567 | 0.273 | 0.104 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.bias + | 0.001 | -0.528 | 0.499 | 0.118 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc11.weight + | -0.084 | -0.349 | 0.141 | 0.078 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.547 | 0.592 | 0.126 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc12.weight + | 0.002 | -0.154 | 0.176 | 0.068 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc12.bias + | 0.001 | -0.520 | 0.480 | 0.125 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.mlp.fc2.weight + | 0.001 | -0.150 | 0.207 | 0.065 | torch.Size([120]) || stage7.residual_group1.blocks.3.mlp.fc2.bias + | 0.726 | 0.137 | 1.004 | 0.160 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.weight + | -0.122 | -0.907 | 0.180 | 0.103 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.bias + | -0.078 | -3.824 | 4.241 | 0.297 | torch.Size([675, 6]) || stage7.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.4.attn.position_bias + | -0.000 | -1.188 | 0.796 | 0.127 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_self.weight + | 0.002 | -0.248 | 0.207 | 0.056 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_self.bias + | -0.001 | -0.409 | 0.369 | 0.085 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.attn.proj.weight + | 0.002 | -0.224 | 0.322 | 0.094 | torch.Size([120]) || stage7.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -1.744 | 1.273 | 0.110 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.001 | -0.092 | 0.113 | 0.019 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.514 | 0.277 | 0.614 | 0.041 | torch.Size([120]) || 
stage7.residual_group1.blocks.4.norm2.weight + | 0.016 | -0.621 | 0.286 | 0.095 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.bias + | 0.001 | -0.517 | 0.453 | 0.116 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc11.weight + | -0.064 | -0.260 | 0.143 | 0.083 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.503 | 0.554 | 0.129 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc12.weight + | -0.004 | -0.232 | 0.193 | 0.075 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc12.bias + | -0.001 | -0.595 | 0.543 | 0.128 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.mlp.fc2.weight + | 0.001 | -0.196 | 0.198 | 0.071 | torch.Size([120]) || stage7.residual_group1.blocks.4.mlp.fc2.bias + | 0.731 | 0.152 | 1.075 | 0.114 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.weight + | -0.076 | -1.003 | 0.176 | 0.107 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.bias + | -0.121 | -3.281 | 4.671 | 0.296 | torch.Size([675, 6]) || stage7.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.640 | 1.083 | 0.122 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_self.weight + | -0.001 | -0.239 | 0.314 | 0.068 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_self.bias + | 0.001 | -0.344 | 0.452 | 0.078 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.attn.proj.weight + | 0.004 | -0.361 | 0.251 | 0.093 | torch.Size([120]) || stage7.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.637 | 0.806 | 0.093 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.000 | -0.088 | 0.091 | 0.017 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.514 | 0.238 | 0.594 | 0.042 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.weight + | 0.017 | -0.650 | 0.162 | 0.089 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.bias + | 0.000 | -0.442 | 0.479 | 0.114 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc11.weight + | -0.040 | -0.400 | 0.203 | 0.101 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.541 | 0.514 | 0.130 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc12.weight + | -0.008 | -0.319 | 0.309 | 0.092 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -1.018 | 1.398 | 0.130 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.mlp.fc2.weight + | 0.001 | -1.606 | 0.269 | 0.179 | torch.Size([120]) || stage7.residual_group1.blocks.5.mlp.fc2.bias + | 0.000 | -0.186 | 0.207 | 0.048 | torch.Size([120, 120]) || stage7.linear1.weight + | 0.010 | -0.448 | 0.437 | 0.161 | torch.Size([120]) || stage7.linear1.bias + | 0.703 | 0.381 | 0.856 | 0.084 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.weight + | 0.014 | -0.645 | 0.486 | 0.169 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.bias + | -0.007 | -4.468 | 1.008 | 0.164 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage7.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.625 | 0.834 | 0.120 | 
torch.Size([360, 120]) || stage7.residual_group2.blocks.0.attn.qkv_self.weight + | -0.009 | -0.737 | 0.632 | 0.135 | torch.Size([360]) || stage7.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.403 | 0.406 | 0.088 | torch.Size([120, 120]) || stage7.residual_group2.blocks.0.attn.proj.weight + | -0.007 | -0.338 | 0.165 | 0.070 | torch.Size([120]) || stage7.residual_group2.blocks.0.attn.proj.bias + | 0.435 | 0.323 | 0.526 | 0.038 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.weight + | 0.005 | -0.678 | 0.379 | 0.117 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.465 | 0.467 | 0.110 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc11.weight + | -0.031 | -0.236 | 0.180 | 0.077 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.490 | 0.520 | 0.121 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc12.weight + | -0.003 | -0.197 | 0.242 | 0.069 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc12.bias + | -0.000 | -0.525 | 0.501 | 0.122 | torch.Size([120, 240]) || stage7.residual_group2.blocks.0.mlp.fc2.weight + | -0.005 | -0.431 | 0.164 | 0.077 | torch.Size([120]) || stage7.residual_group2.blocks.0.mlp.fc2.bias + | 0.703 | 0.306 | 0.866 | 0.079 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.weight + | 0.009 | -0.647 | 0.481 | 0.149 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.bias + | -0.010 | -3.504 | 1.842 | 0.134 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage7.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.639 | 0.590 | 0.122 | torch.Size([360, 120]) || stage7.residual_group2.blocks.1.attn.qkv_self.weight + | -0.001 | -0.613 | 0.609 | 0.148 | torch.Size([360]) || stage7.residual_group2.blocks.1.attn.qkv_self.bias + | 0.001 | -0.316 | 0.325 | 0.085 | torch.Size([120, 120]) || stage7.residual_group2.blocks.1.attn.proj.weight + | -0.004 | -0.350 | 0.145 | 0.069 | torch.Size([120]) || stage7.residual_group2.blocks.1.attn.proj.bias + | 0.452 | 0.309 | 0.558 | 0.037 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.weight + | 0.003 | -0.661 | 0.246 | 0.091 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.580 | 0.410 | 0.108 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc11.weight + | -0.020 | -0.258 | 0.299 | 0.104 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.561 | 0.126 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc12.weight + | -0.002 | -0.234 | 0.434 | 0.090 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.778 | 0.581 | 0.124 | torch.Size([120, 240]) || stage7.residual_group2.blocks.1.mlp.fc2.weight + | -0.001 | -0.888 | 0.286 | 0.135 | torch.Size([120]) || stage7.residual_group2.blocks.1.mlp.fc2.bias + | -0.001 | -0.348 | 0.237 | 0.060 | torch.Size([120, 120]) || stage7.linear2.weight + | 0.023 | -0.390 | 0.506 | 0.167 | torch.Size([120]) || stage7.linear2.bias + | -0.000 | -0.104 | 0.107 | 0.024 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.weight + | 0.002 | -0.041 | 0.035 | 0.016 | torch.Size([120]) || stage7.pa_deform.bias + | -0.000 | -0.123 | 0.109 | 0.017 | torch.Size([120, 242, 3, 3]) || stage7.pa_deform.conv_offset.0.weight + | -0.002 | -0.034 | 0.032 | 0.015 | torch.Size([120]) || 
stage7.pa_deform.conv_offset.0.bias + | -0.001 | -0.111 | 0.084 | 0.019 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.2.weight + | -0.008 | -0.073 | 0.081 | 0.034 | torch.Size([120]) || stage7.pa_deform.conv_offset.2.bias + | -0.002 | -0.154 | 0.122 | 0.018 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.4.weight + | 0.014 | -0.041 | 0.068 | 0.026 | torch.Size([120]) || stage7.pa_deform.conv_offset.4.bias + | -0.001 | -0.408 | 0.365 | 0.034 | torch.Size([324, 120, 3, 3]) || stage7.pa_deform.conv_offset.6.weight + | -0.003 | -0.057 | 0.054 | 0.024 | torch.Size([324]) || stage7.pa_deform.conv_offset.6.bias + | 0.000 | -0.697 | 0.606 | 0.123 | torch.Size([360, 360]) || stage7.pa_fuse.fc11.weight + | 0.119 | -0.211 | 0.720 | 0.177 | torch.Size([360]) || stage7.pa_fuse.fc11.bias + | 0.000 | -1.175 | 0.924 | 0.154 | torch.Size([360, 360]) || stage7.pa_fuse.fc12.weight + | -0.000 | -0.581 | 0.580 | 0.190 | torch.Size([360]) || stage7.pa_fuse.fc12.bias + | 0.001 | -0.786 | 0.874 | 0.135 | torch.Size([120, 360]) || stage7.pa_fuse.fc2.weight + | -0.053 | -0.522 | 0.577 | 0.205 | torch.Size([120]) || stage7.pa_fuse.fc2.bias + | 1.225 | 1.000 | 1.516 | 0.095 | torch.Size([120]) || stage8.0.1.weight + | -0.013 | -0.413 | 0.465 | 0.139 | torch.Size([120]) || stage8.0.1.bias + | 0.000 | -2.505 | 0.627 | 0.136 | torch.Size([180, 120]) || stage8.0.2.weight + | 0.005 | -0.397 | 0.377 | 0.107 | torch.Size([180]) || stage8.0.2.bias + | 0.456 | 0.123 | 0.760 | 0.129 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.weight + | -0.022 | -0.343 | 0.875 | 0.099 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.bias + | -0.014 | -1.907 | 2.592 | 0.130 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.632 | 0.628 | 0.099 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.0.attn.qkv_self.weight + | 0.006 | -0.567 | 0.668 | 0.148 | torch.Size([540]) || stage8.1.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.477 | 0.447 | 0.094 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.0.attn.proj.weight + | -0.010 | -0.460 | 0.225 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.0.attn.proj.bias + | 0.429 | 0.119 | 0.634 | 0.090 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.weight + | -0.007 | -0.338 | 0.803 | 0.086 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.bias + | -0.006 | -0.572 | 0.539 | 0.119 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc11.weight + | -0.060 | -0.260 | 0.185 | 0.060 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.461 | 0.548 | 0.113 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc12.weight + | 0.000 | -0.163 | 0.183 | 0.050 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.757 | 0.581 | 0.118 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.0.mlp.fc2.weight + | -0.003 | -0.191 | 0.121 | 0.057 | torch.Size([180]) || stage8.1.residual_group.blocks.0.mlp.fc2.bias + | 0.557 | 0.086 | 0.800 | 0.112 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.weight + | -0.029 | -0.230 | 0.878 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.bias + | -0.016 | -2.004 | 1.711 | 0.154 | torch.Size([2475, 6]) || 
stage8.1.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.690 | 0.575 | 0.109 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.1.attn.qkv_self.weight + | 0.011 | -0.641 | 0.609 | 0.135 | torch.Size([540]) || stage8.1.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.466 | 0.401 | 0.094 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.1.attn.proj.weight + | -0.008 | -0.344 | 0.181 | 0.080 | torch.Size([180]) || stage8.1.residual_group.blocks.1.attn.proj.bias + | 0.503 | 0.226 | 0.742 | 0.093 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.weight + | -0.009 | -0.404 | 0.818 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.bias + | -0.007 | -0.595 | 0.532 | 0.121 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc11.weight + | -0.068 | -0.261 | 0.071 | 0.053 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.573 | 0.116 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc12.weight + | 0.002 | -0.129 | 0.197 | 0.046 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.556 | 0.582 | 0.118 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.1.mlp.fc2.weight + | -0.003 | -0.170 | 0.145 | 0.052 | torch.Size([180]) || stage8.1.residual_group.blocks.1.mlp.fc2.bias + | 0.699 | 0.202 | 0.912 | 0.109 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.weight + | -0.033 | -0.253 | 0.924 | 0.091 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.bias + | -0.030 | -2.510 | 2.088 | 0.194 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.637 | 0.801 | 0.116 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.2.attn.qkv_self.weight + | 0.006 | -0.512 | 0.520 | 0.110 | torch.Size([540]) || stage8.1.residual_group.blocks.2.attn.qkv_self.bias + | 0.000 | -0.381 | 0.337 | 0.090 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.2.attn.proj.weight + | -0.011 | -0.238 | 0.234 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.2.attn.proj.bias + | 0.594 | 0.150 | 0.810 | 0.108 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.weight + | -0.010 | -0.483 | 0.726 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.bias + | -0.006 | -0.567 | 0.499 | 0.125 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc11.weight + | -0.077 | -0.360 | 0.050 | 0.056 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.536 | 0.673 | 0.119 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc12.weight + | 0.001 | -0.142 | 0.186 | 0.043 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.536 | 0.524 | 0.119 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.2.mlp.fc2.weight + | -0.006 | -0.147 | 0.133 | 0.051 | torch.Size([180]) || stage8.1.residual_group.blocks.2.mlp.fc2.bias + | 0.683 | 0.141 | 0.908 | 0.105 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.weight + | -0.033 | -0.199 | 0.878 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.bias + | -0.039 | -1.527 | 3.891 | 0.199 | torch.Size([2475, 6]) || 
stage8.1.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.682 | 0.693 | 0.120 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.3.attn.qkv_self.weight + | 0.007 | -0.543 | 0.513 | 0.138 | torch.Size([540]) || stage8.1.residual_group.blocks.3.attn.qkv_self.bias + | -0.001 | -0.390 | 0.476 | 0.089 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.3.attn.proj.weight + | -0.007 | -0.176 | 0.150 | 0.062 | torch.Size([180]) || stage8.1.residual_group.blocks.3.attn.proj.bias + | 0.640 | 0.094 | 0.853 | 0.120 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.weight + | -0.009 | -0.372 | 0.683 | 0.084 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.bias + | -0.006 | -0.628 | 0.521 | 0.126 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc11.weight + | -0.089 | -0.367 | 0.047 | 0.054 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.629 | 0.562 | 0.121 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc12.weight + | -0.001 | -0.186 | 0.128 | 0.042 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.485 | 0.499 | 0.118 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.3.mlp.fc2.weight + | -0.007 | -0.138 | 0.209 | 0.050 | torch.Size([180]) || stage8.1.residual_group.blocks.3.mlp.fc2.bias + | 0.000 | -0.294 | 0.577 | 0.071 | torch.Size([180, 180]) || stage8.1.linear.weight + | 0.004 | -0.349 | 0.235 | 0.072 | torch.Size([180]) || stage8.1.linear.bias + | 0.708 | 0.242 | 1.026 | 0.136 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.weight + | -0.032 | -0.212 | 0.830 | 0.100 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.bias + | -0.039 | -1.954 | 2.394 | 0.212 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.922 | 0.646 | 0.116 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.429 | 0.524 | 0.101 | torch.Size([540]) || stage8.2.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.467 | 0.453 | 0.109 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.0.attn.proj.weight + | -0.005 | -0.339 | 0.264 | 0.095 | torch.Size([180]) || stage8.2.residual_group.blocks.0.attn.proj.bias + | 0.587 | 0.255 | 0.837 | 0.086 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.weight + | -0.011 | -0.285 | 0.721 | 0.083 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.bias + | -0.006 | -0.586 | 0.534 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc11.weight + | -0.075 | -0.225 | 0.066 | 0.047 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.493 | 0.532 | 0.123 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc12.weight + | 0.003 | -0.189 | 0.178 | 0.047 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.551 | 0.543 | 0.124 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.0.mlp.fc2.weight + | -0.010 | -0.154 | 0.142 | 0.054 | torch.Size([180]) || stage8.2.residual_group.blocks.0.mlp.fc2.bias + | 0.773 | 0.210 | 1.004 | 0.113 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.weight + | 
-0.035 | -0.176 | 0.873 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.bias + | -0.027 | -2.407 | 1.736 | 0.214 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.817 | 0.977 | 0.123 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.1.attn.qkv_self.weight + | 0.001 | -0.659 | 0.461 | 0.115 | torch.Size([540]) || stage8.2.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.484 | 0.453 | 0.109 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.1.attn.proj.weight + | -0.014 | -0.315 | 0.252 | 0.091 | torch.Size([180]) || stage8.2.residual_group.blocks.1.attn.proj.bias + | 0.641 | 0.337 | 0.810 | 0.081 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.weight + | -0.011 | -0.177 | 0.806 | 0.083 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.bias + | -0.006 | -0.569 | 0.598 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc11.weight + | -0.079 | -0.323 | 0.071 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.512 | 0.577 | 0.126 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc12.weight + | -0.003 | -0.142 | 0.161 | 0.050 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.529 | 0.572 | 0.125 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.1.mlp.fc2.weight + | -0.010 | -0.178 | 0.159 | 0.066 | torch.Size([180]) || stage8.2.residual_group.blocks.1.mlp.fc2.bias + | 0.857 | 0.199 | 1.153 | 0.112 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.weight + | -0.039 | -0.189 | 0.943 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.bias + | -0.042 | -1.962 | 2.773 | 0.246 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.783 | 0.655 | 0.123 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.2.attn.qkv_self.weight + | 0.004 | -0.338 | 0.533 | 0.099 | torch.Size([540]) || stage8.2.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.497 | 0.461 | 0.107 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.2.attn.proj.weight + | -0.008 | -0.288 | 0.183 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.2.attn.proj.bias + | 0.681 | 0.327 | 0.878 | 0.085 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.weight + | -0.012 | -0.178 | 0.773 | 0.084 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.bias + | -0.006 | -0.789 | 0.546 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc11.weight + | -0.081 | -0.249 | 0.036 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.526 | 0.555 | 0.128 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc12.weight + | 0.000 | -0.133 | 0.191 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.572 | 0.529 | 0.126 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.164 | 0.147 | 0.065 | torch.Size([180]) || stage8.2.residual_group.blocks.2.mlp.fc2.bias + | 0.877 | 0.198 | 1.043 | 0.094 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.weight + | 
-0.038 | -0.210 | 0.916 | 0.091 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.bias + | -0.094 | -2.974 | 4.987 | 0.299 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.964 | 1.011 | 0.126 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.3.attn.qkv_self.weight + | -0.002 | -0.404 | 0.429 | 0.101 | torch.Size([540]) || stage8.2.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.501 | 0.489 | 0.110 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.3.attn.proj.weight + | -0.021 | -0.305 | 0.208 | 0.097 | torch.Size([180]) || stage8.2.residual_group.blocks.3.attn.proj.bias + | 0.697 | 0.295 | 0.894 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.weight + | -0.015 | -0.241 | 0.712 | 0.086 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.bias + | -0.005 | -0.562 | 0.573 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc11.weight + | -0.085 | -0.302 | 0.080 | 0.060 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.734 | 0.573 | 0.130 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc12.weight + | 0.001 | -0.150 | 0.161 | 0.054 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.671 | 0.623 | 0.127 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.3.mlp.fc2.weight + | -0.023 | -0.252 | 0.317 | 0.081 | torch.Size([180]) || stage8.2.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.278 | 0.345 | 0.064 | torch.Size([180, 180]) || stage8.2.linear.weight + | 0.004 | -0.315 | 0.148 | 0.064 | torch.Size([180]) || stage8.2.linear.bias + | 0.850 | 0.326 | 1.087 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.weight + | -0.031 | -0.334 | 0.779 | 0.106 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.bias + | -0.012 | -2.917 | 1.476 | 0.175 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.603 | 0.666 | 0.124 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.374 | 0.381 | 0.086 | torch.Size([540]) || stage8.3.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.577 | 0.605 | 0.119 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.0.attn.proj.weight + | -0.008 | -0.394 | 0.499 | 0.134 | torch.Size([180]) || stage8.3.residual_group.blocks.0.attn.proj.bias + | 0.636 | 0.321 | 0.790 | 0.073 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.weight + | -0.013 | -0.294 | 0.774 | 0.090 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.bias + | -0.004 | -0.540 | 0.539 | 0.123 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc11.weight + | -0.065 | -0.212 | 0.047 | 0.051 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.608 | 0.603 | 0.130 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc12.weight + | -0.002 | -0.177 | 0.155 | 0.051 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.573 | 0.630 | 0.129 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.0.mlp.fc2.weight + | -0.005 | -0.189 | 0.178 | 0.071 | 
torch.Size([180]) || stage8.3.residual_group.blocks.0.mlp.fc2.bias + | 0.899 | 0.275 | 1.048 | 0.099 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.weight + | -0.031 | -0.223 | 0.771 | 0.088 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.bias + | -0.003 | -3.151 | 1.718 | 0.202 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.1.attn.relative_position_index + | -0.000 | -0.732 | 0.868 | 0.127 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.1.attn.qkv_self.weight + | 0.002 | -0.412 | 0.350 | 0.093 | torch.Size([540]) || stage8.3.residual_group.blocks.1.attn.qkv_self.bias + | 0.001 | -0.466 | 0.487 | 0.114 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.1.attn.proj.weight + | -0.006 | -0.388 | 0.400 | 0.129 | torch.Size([180]) || stage8.3.residual_group.blocks.1.attn.proj.bias + | 0.711 | 0.381 | 0.864 | 0.082 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.weight + | -0.009 | -0.240 | 0.692 | 0.090 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.bias + | -0.005 | -0.657 | 0.639 | 0.126 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc11.weight + | -0.077 | -0.263 | 0.047 | 0.057 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.673 | 0.605 | 0.134 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc12.weight + | 0.002 | -0.158 | 0.155 | 0.046 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc12.bias + | -0.000 | -0.582 | 0.585 | 0.131 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.1.mlp.fc2.weight + | -0.009 | -0.253 | 0.178 | 0.070 | torch.Size([180]) || stage8.3.residual_group.blocks.1.mlp.fc2.bias + | 0.941 | 0.262 | 1.154 | 0.094 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.weight + | -0.032 | -0.162 | 0.906 | 0.084 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.bias + | -0.005 | -3.421 | 1.350 | 0.205 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.777 | 0.735 | 0.130 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.2.attn.qkv_self.weight + | 0.000 | -0.355 | 0.421 | 0.092 | torch.Size([540]) || stage8.3.residual_group.blocks.2.attn.qkv_self.bias + | 0.000 | -0.479 | 0.475 | 0.115 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.2.attn.proj.weight + | -0.013 | -0.292 | 0.345 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.2.attn.proj.bias + | 0.743 | 0.242 | 0.919 | 0.093 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.weight + | -0.011 | -0.214 | 0.691 | 0.094 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.bias + | -0.005 | -0.633 | 0.498 | 0.127 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc11.weight + | -0.082 | -0.346 | 0.087 | 0.062 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.591 | 0.670 | 0.134 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc12.weight + | 0.001 | -0.190 | 0.151 | 0.056 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.560 | 0.637 | 0.132 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.2.mlp.fc2.weight + | -0.009 | -0.226 | 0.250 | 0.085 | 
torch.Size([180]) || stage8.3.residual_group.blocks.2.mlp.fc2.bias + | 0.950 | 0.250 | 1.103 | 0.086 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.weight + | -0.035 | -0.196 | 0.925 | 0.088 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.bias + | -0.026 | -3.591 | 5.653 | 0.236 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.753 | 0.637 | 0.128 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.3.attn.qkv_self.weight + | 0.000 | -0.333 | 0.432 | 0.081 | torch.Size([540]) || stage8.3.residual_group.blocks.3.attn.qkv_self.bias + | 0.001 | -0.591 | 0.591 | 0.118 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.3.attn.proj.weight + | -0.014 | -0.348 | 0.267 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.3.attn.proj.bias + | 0.735 | 0.254 | 0.893 | 0.082 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.weight + | -0.011 | -0.241 | 0.659 | 0.093 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.bias + | -0.005 | -0.628 | 0.667 | 0.125 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc11.weight + | -0.076 | -0.411 | 0.113 | 0.072 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.662 | 0.578 | 0.135 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc12.weight + | -0.004 | -0.208 | 0.169 | 0.054 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.602 | 0.588 | 0.131 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.3.mlp.fc2.weight + | -0.011 | -0.218 | 0.232 | 0.096 | torch.Size([180]) || stage8.3.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.343 | 0.316 | 0.065 | torch.Size([180, 180]) || stage8.3.linear.weight + | 0.010 | -0.297 | 0.187 | 0.061 | torch.Size([180]) || stage8.3.linear.bias + | 1.012 | 0.330 | 1.282 | 0.149 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.weight + | -0.030 | -0.347 | 0.800 | 0.134 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.bias + | -0.013 | -2.816 | 3.792 | 0.236 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.807 | 0.825 | 0.131 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.0.attn.qkv_self.weight + | -0.003 | -0.429 | 0.319 | 0.083 | torch.Size([540]) || stage8.4.residual_group.blocks.0.attn.qkv_self.bias + | 0.001 | -0.553 | 0.569 | 0.136 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.0.attn.proj.weight + | -0.019 | -0.443 | 0.441 | 0.139 | torch.Size([180]) || stage8.4.residual_group.blocks.0.attn.proj.bias + | 0.638 | 0.420 | 0.797 | 0.063 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.weight + | -0.018 | -0.222 | 0.886 | 0.107 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.bias + | -0.002 | -0.576 | 0.510 | 0.117 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc11.weight + | -0.018 | -0.277 | 0.123 | 0.068 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.687 | 0.625 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc12.weight + | -0.007 | -0.264 | 0.267 | 0.076 | torch.Size([360]) || 
stage8.4.residual_group.blocks.0.mlp.fc12.bias + | 0.001 | -0.639 | 0.705 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.0.mlp.fc2.weight + | -0.012 | -0.255 | 0.274 | 0.095 | torch.Size([180]) || stage8.4.residual_group.blocks.0.mlp.fc2.bias + | 1.092 | 0.475 | 1.341 | 0.115 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.weight + | -0.030 | -0.294 | 0.686 | 0.113 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.bias + | 0.018 | -3.165 | 0.990 | 0.213 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.695 | 0.699 | 0.133 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.1.attn.qkv_self.weight + | 0.002 | -0.319 | 0.286 | 0.075 | torch.Size([540]) || stage8.4.residual_group.blocks.1.attn.qkv_self.bias + | -0.001 | -0.542 | 0.519 | 0.133 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.1.attn.proj.weight + | -0.017 | -0.439 | 0.451 | 0.152 | torch.Size([180]) || stage8.4.residual_group.blocks.1.attn.proj.bias + | 0.664 | 0.366 | 0.835 | 0.074 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.weight + | -0.015 | -0.217 | 0.985 | 0.103 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.bias + | -0.002 | -0.641 | 0.563 | 0.117 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc11.weight + | -0.022 | -0.381 | 0.161 | 0.078 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.571 | 0.642 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc12.weight + | 0.003 | -0.279 | 0.311 | 0.087 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.738 | 0.633 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.1.mlp.fc2.weight + | -0.007 | -0.254 | 0.261 | 0.084 | torch.Size([180]) || stage8.4.residual_group.blocks.1.mlp.fc2.bias + | 1.125 | 0.525 | 1.405 | 0.117 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.weight + | -0.033 | -0.186 | 0.627 | 0.082 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.bias + | 0.028 | -3.477 | 0.957 | 0.217 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.663 | 0.658 | 0.130 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.2.attn.qkv_self.weight + | -0.007 | -0.357 | 0.255 | 0.064 | torch.Size([540]) || stage8.4.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.596 | 0.578 | 0.137 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.2.attn.proj.weight + | -0.018 | -0.506 | 0.389 | 0.159 | torch.Size([180]) || stage8.4.residual_group.blocks.2.attn.proj.bias + | 0.694 | 0.319 | 0.865 | 0.084 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.weight + | -0.018 | -0.150 | 0.975 | 0.087 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.bias + | -0.002 | -0.619 | 0.565 | 0.116 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc11.weight + | -0.025 | -0.345 | 0.208 | 0.086 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.624 | 0.607 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc12.weight + | -0.003 | -0.388 | 0.290 | 0.075 | torch.Size([360]) || 
stage8.4.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.927 | 0.675 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.325 | 0.240 | 0.096 | torch.Size([180]) || stage8.4.residual_group.blocks.2.mlp.fc2.bias + | 1.108 | 0.535 | 1.297 | 0.094 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.weight + | -0.035 | -0.213 | 0.546 | 0.064 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.bias + | 0.020 | -3.042 | 1.420 | 0.192 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.697 | 0.700 | 0.128 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.3.attn.qkv_self.weight + | -0.000 | -0.220 | 0.311 | 0.065 | torch.Size([540]) || stage8.4.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.652 | 0.592 | 0.138 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.3.attn.proj.weight + | -0.019 | -0.535 | 0.426 | 0.154 | torch.Size([180]) || stage8.4.residual_group.blocks.3.attn.proj.bias + | 0.685 | 0.225 | 0.893 | 0.082 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.weight + | -0.023 | -0.211 | 0.938 | 0.093 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.bias + | -0.001 | -0.501 | 0.564 | 0.113 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc11.weight + | -0.014 | -0.339 | 0.237 | 0.092 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.560 | 0.626 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc12.weight + | 0.000 | -0.231 | 0.239 | 0.075 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.544 | 0.657 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.3.mlp.fc2.weight + | -0.007 | -0.271 | 0.274 | 0.093 | torch.Size([180]) || stage8.4.residual_group.blocks.3.mlp.fc2.bias + | -0.001 | -0.473 | 0.481 | 0.069 | torch.Size([180, 180]) || stage8.4.linear.weight + | 0.029 | -0.333 | 0.194 | 0.076 | torch.Size([180]) || stage8.4.linear.bias + | 1.025 | 0.297 | 1.336 | 0.162 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.weight + | -0.034 | -0.429 | 0.872 | 0.141 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.bias + | -0.574 | -4.515 | 3.381 | 0.800 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.771 | 0.886 | 0.125 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.0.attn.qkv_self.weight + | 0.000 | -0.356 | 0.521 | 0.085 | torch.Size([540]) || stage8.5.residual_group.blocks.0.attn.qkv_self.bias + | -0.001 | -0.632 | 0.656 | 0.147 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.0.attn.proj.weight + | -0.029 | -0.329 | 0.697 | 0.127 | torch.Size([180]) || stage8.5.residual_group.blocks.0.attn.proj.bias + | 0.777 | 0.446 | 0.952 | 0.069 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.weight + | -0.022 | -0.335 | 0.920 | 0.121 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.bias + | -0.002 | -0.520 | 0.598 | 0.117 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc11.weight + | -0.013 | -0.456 | 0.200 | 0.075 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | 
-0.677 | 0.642 | 0.137 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc12.weight + | 0.005 | -0.272 | 0.233 | 0.083 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.762 | 0.598 | 0.136 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.0.mlp.fc2.weight + | -0.025 | -0.244 | 0.583 | 0.111 | torch.Size([180]) || stage8.5.residual_group.blocks.0.mlp.fc2.bias + | 1.021 | 0.261 | 1.261 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.weight + | -0.033 | -0.358 | 0.867 | 0.120 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.bias + | -0.550 | -3.274 | 4.406 | 0.670 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.819 | 0.986 | 0.122 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.1.attn.qkv_self.weight + | 0.005 | -0.510 | 0.446 | 0.084 | torch.Size([540]) || stage8.5.residual_group.blocks.1.attn.qkv_self.bias + | -0.003 | -0.739 | 0.682 | 0.151 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.1.attn.proj.weight + | -0.032 | -0.318 | 0.607 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.attn.proj.bias + | 0.823 | 0.420 | 0.950 | 0.070 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.weight + | -0.021 | -0.274 | 0.882 | 0.111 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.bias + | -0.002 | -0.496 | 0.532 | 0.117 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc11.weight + | -0.028 | -0.260 | 0.194 | 0.080 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.620 | 0.586 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc12.weight + | 0.004 | -0.284 | 0.423 | 0.083 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.774 | 0.614 | 0.137 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.1.mlp.fc2.weight + | -0.028 | -0.371 | 0.561 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.mlp.fc2.bias + | 1.096 | 0.377 | 1.321 | 0.110 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.weight + | -0.033 | -0.244 | 0.755 | 0.100 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.bias + | -0.441 | -3.439 | 5.870 | 0.668 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.710 | 0.679 | 0.123 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.2.attn.qkv_self.weight + | 0.003 | -0.277 | 0.283 | 0.068 | torch.Size([540]) || stage8.5.residual_group.blocks.2.attn.qkv_self.bias + | 0.001 | -0.824 | 0.684 | 0.150 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.2.attn.proj.weight + | -0.033 | -0.390 | 0.545 | 0.155 | torch.Size([180]) || stage8.5.residual_group.blocks.2.attn.proj.bias + | 0.843 | 0.390 | 0.984 | 0.076 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.weight + | -0.022 | -0.211 | 0.854 | 0.090 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.bias + | -0.002 | -0.522 | 0.503 | 0.116 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc11.weight + | -0.024 | -0.243 | 0.219 | 0.091 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc11.bias + | -0.001 | -0.638 | 0.617 | 
0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc12.weight + | -0.004 | -0.268 | 0.380 | 0.078 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.713 | 0.769 | 0.138 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.2.mlp.fc2.weight + | -0.034 | -0.372 | 0.592 | 0.151 | torch.Size([180]) || stage8.5.residual_group.blocks.2.mlp.fc2.bias + | 1.027 | 0.318 | 1.206 | 0.094 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.weight + | -0.033 | -0.187 | 0.768 | 0.088 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.bias + | -0.347 | -2.664 | 2.684 | 0.528 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.677 | 0.676 | 0.127 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.3.attn.qkv_self.weight + | 0.002 | -0.410 | 0.354 | 0.080 | torch.Size([540]) || stage8.5.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.630 | 0.725 | 0.145 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.3.attn.proj.weight + | -0.041 | -0.385 | 0.660 | 0.163 | torch.Size([180]) || stage8.5.residual_group.blocks.3.attn.proj.bias + | 0.849 | 0.390 | 0.985 | 0.070 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.weight + | -0.023 | -0.163 | 0.810 | 0.084 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.bias + | -0.002 | -0.547 | 0.536 | 0.115 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc11.weight + | -0.012 | -0.366 | 0.252 | 0.106 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.669 | 0.597 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc12.weight + | -0.002 | -0.216 | 0.202 | 0.074 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.700 | 0.674 | 0.139 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.3.mlp.fc2.weight + | -0.032 | -0.376 | 0.666 | 0.134 | torch.Size([180]) || stage8.5.residual_group.blocks.3.mlp.fc2.bias + | -0.001 | -0.299 | 0.469 | 0.069 | torch.Size([180, 180]) || stage8.5.linear.weight + | 0.081 | -0.562 | 0.263 | 0.109 | torch.Size([180]) || stage8.5.linear.bias + | 1.111 | 0.208 | 1.434 | 0.192 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.weight + | -0.048 | -0.547 | 0.851 | 0.175 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.bias + | -0.252 | -2.157 | 6.293 | 0.490 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.664 | 0.631 | 0.123 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.0.attn.qkv_self.weight + | 0.007 | -0.293 | 0.366 | 0.078 | torch.Size([540]) || stage8.6.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.701 | 0.726 | 0.154 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.0.attn.proj.weight + | 0.030 | -0.318 | 0.331 | 0.109 | torch.Size([180]) || stage8.6.residual_group.blocks.0.attn.proj.bias + | 0.959 | 0.475 | 1.322 | 0.088 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.weight + | -0.039 | -0.421 | 0.873 | 0.151 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.bias + | -0.002 | -0.550 | 0.783 | 0.116 | torch.Size([360, 180]) || 
stage8.6.residual_group.blocks.0.mlp.fc11.weight + | 0.002 | -0.269 | 0.152 | 0.069 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.914 | 0.839 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc12.weight + | 0.001 | -0.340 | 0.304 | 0.075 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.592 | 0.713 | 0.140 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.0.mlp.fc2.weight + | 0.002 | -0.535 | 0.384 | 0.177 | torch.Size([180]) || stage8.6.residual_group.blocks.0.mlp.fc2.bias + | 1.123 | 0.183 | 1.352 | 0.165 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.weight + | -0.047 | -0.513 | 0.903 | 0.168 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.bias + | -0.234 | -1.968 | 6.366 | 0.448 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.751 | 0.759 | 0.121 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.1.attn.qkv_self.weight + | -0.001 | -0.300 | 0.214 | 0.061 | torch.Size([540]) || stage8.6.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.657 | 0.699 | 0.148 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.1.attn.proj.weight + | 0.031 | -0.321 | 0.293 | 0.115 | torch.Size([180]) || stage8.6.residual_group.blocks.1.attn.proj.bias + | 0.986 | 0.416 | 1.360 | 0.096 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.weight + | -0.038 | -0.393 | 0.807 | 0.146 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.bias + | -0.001 | -0.589 | 0.620 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc11.weight + | 0.005 | -0.316 | 0.229 | 0.071 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.738 | 0.766 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc12.weight + | 0.001 | -0.252 | 0.302 | 0.072 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.674 | 0.629 | 0.140 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.1.mlp.fc2.weight + | -0.001 | -0.475 | 0.441 | 0.175 | torch.Size([180]) || stage8.6.residual_group.blocks.1.mlp.fc2.bias + | 1.097 | 0.342 | 1.294 | 0.134 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.weight + | -0.054 | -0.639 | 0.904 | 0.186 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.bias + | -0.135 | -3.252 | 1.238 | 0.360 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.672 | 0.663 | 0.128 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.2.attn.qkv_self.weight + | 0.007 | -0.170 | 0.228 | 0.046 | torch.Size([540]) || stage8.6.residual_group.blocks.2.attn.qkv_self.bias + | -0.001 | -0.660 | 0.651 | 0.147 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.2.attn.proj.weight + | 0.031 | -0.360 | 0.322 | 0.126 | torch.Size([180]) || stage8.6.residual_group.blocks.2.attn.proj.bias + | 1.004 | 0.360 | 1.381 | 0.099 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.weight + | -0.042 | -0.447 | 0.808 | 0.157 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.bias + | -0.000 | -0.600 | 0.603 | 0.116 | torch.Size([360, 180]) || 
stage8.6.residual_group.blocks.2.mlp.fc11.weight + | 0.022 | -0.447 | 0.249 | 0.086 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.666 | 0.708 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc12.weight + | -0.002 | -0.326 | 0.272 | 0.075 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc12.bias + | -0.001 | -0.653 | 0.719 | 0.142 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.488 | 0.321 | 0.153 | torch.Size([180]) || stage8.6.residual_group.blocks.2.mlp.fc2.bias + | 1.095 | 0.272 | 1.302 | 0.123 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.weight + | -0.052 | -0.557 | 1.069 | 0.192 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.bias + | -0.196 | -2.349 | 1.401 | 0.360 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.741 | 0.657 | 0.124 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.3.attn.qkv_self.weight + | 0.001 | -0.186 | 0.141 | 0.040 | torch.Size([540]) || stage8.6.residual_group.blocks.3.attn.qkv_self.bias + | -0.001 | -0.669 | 0.671 | 0.139 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.3.attn.proj.weight + | -0.004 | -0.323 | 0.300 | 0.124 | torch.Size([180]) || stage8.6.residual_group.blocks.3.attn.proj.bias + | 0.999 | 0.383 | 1.380 | 0.103 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.weight + | -0.044 | -0.392 | 0.694 | 0.163 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.bias + | 0.000 | -0.577 | 0.857 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc11.weight + | 0.041 | -0.394 | 0.238 | 0.087 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.924 | 0.828 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc12.weight + | -0.003 | -0.214 | 0.407 | 0.071 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.827 | 0.755 | 0.141 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.3.mlp.fc2.weight + | 0.022 | -0.296 | 0.262 | 0.107 | torch.Size([180]) || stage8.6.residual_group.blocks.3.mlp.fc2.bias + | 0.002 | -1.059 | 1.262 | 0.089 | torch.Size([180, 180]) || stage8.6.linear.weight + | 0.031 | -0.789 | 0.427 | 0.120 | torch.Size([180]) || stage8.6.linear.bias + | 0.389 | 0.079 | 1.137 | 0.176 | torch.Size([180]) || norm.weight + | -0.021 | -0.669 | 0.888 | 0.127 | torch.Size([180]) || norm.bias + | 0.000 | -0.486 | 0.568 | 0.103 | torch.Size([120, 180]) || conv_after_body.weight + | -0.000 | -0.167 | 0.168 | 0.055 | torch.Size([120]) || conv_after_body.bias + | -0.000 | -1.782 | 1.300 | 0.109 | torch.Size([64, 120, 1, 3, 3]) || conv_before_upsample.0.weight + | -0.019 | -0.542 | 0.437 | 0.162 | torch.Size([64]) || conv_before_upsample.0.bias + | 0.001 | -1.915 | 1.372 | 0.090 | torch.Size([256, 64, 1, 3, 3]) || upsample.0.weight + | -0.045 | -0.281 | 0.215 | 0.097 | torch.Size([256]) || upsample.0.bias + | -0.006 | -4.826 | 0.582 | 0.075 | torch.Size([256, 64, 1, 3, 3]) || upsample.5.weight + | -0.154 | -0.441 | 0.187 | 0.100 | torch.Size([256]) || upsample.5.bias + | 0.000 | -0.210 | 0.246 | 0.012 | torch.Size([64, 64, 1, 3, 3]) || upsample.10.weight + | 0.000 | -0.013 | 0.007 | 0.003 | torch.Size([64]) || upsample.10.bias + | 0.000 | -0.044 | 0.042 | 0.004 | torch.Size([3, 64, 1, 3, 3]) || conv_last.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([3]) || conv_last.bias +
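The table above lists, for every entry in the generator's state_dict, the values mean | min | max | std | shape followed by the parameter name. A minimal sketch of how such a table can be produced from any torch.nn.Module; the helper name describe_params and the exact column formatting here are illustrative, not KAIR's verbatim code:

```python
import torch

def describe_params(model: torch.nn.Module) -> str:
    """Emit one row per state_dict entry: mean | min | max | std | shape || name."""
    rows = []
    for name, tensor in model.state_dict().items():
        # Cast to float so integer buffers (e.g. relative_position_index)
        # also yield mean/std, as they do in the log above.
        v = tensor.data.clone().float()
        std = v.std().item() if v.numel() > 1 else 0.0  # std of a 1-element tensor is nan
        rows.append(' | {:>7.3f} | {:>7.3f} | {:>7.3f} | {:>7.3f} | {} || {}'.format(
            v.mean().item(), v.min().item(), v.max().item(), std, v.shape, name))
    return '\n'.join(rows)
```

The float cast explains the fractional statistics reported for index buffers, e.g. relative_position_index over values 0..674 showing mean 337.000 and std 166.395.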
+22-03-11 10:52:19.525 : task: 001_train_vrt_videosr_bi_reds_6frames + model: vrt + gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7] + dist: False + find_unused_parameters: False + use_static_graph: True + scale: 4 + n_channels: 3 + path:[ + root: experiments + pretrained_netG: /home/cll/dev/KAIR/model_zoo/vrt/001_VRT_videosr_bi_REDS_6frames.pth + pretrained_netE: None + task: experiments/001_train_vrt_videosr_bi_reds_6frames + log: experiments/001_train_vrt_videosr_bi_reds_6frames + options: experiments/001_train_vrt_videosr_bi_reds_6frames/options + models: experiments/001_train_vrt_videosr_bi_reds_6frames/models + images: experiments/001_train_vrt_videosr_bi_reds_6frames/images + pretrained_optimizerG: None + ] + datasets:[ + train:[ + name: train_dataset + dataset_type: VideoRecurrentTrainDataset + dataroot_gt: /home/cll/datasets/REDS/train/train_sharp + dataroot_lq: /home/cll/datasets/REDS/train/train_sharp_bicubic/X4 + meta_info_file: data/meta_info/meta_info_REDS_GT.txt + filename_tmpl: 08d + filename_ext: png + val_partition: REDS4 + test_mode: False + io_backend:[ + type: disk + ] + num_frame: 6 + gt_size: 256 + interval_list: [1] + random_reverse: False + use_hflip: True + use_rot: True + dataloader_shuffle: True + dataloader_num_workers: 32 + dataloader_batch_size: 8 + phase: train + scale: 4 + n_channels: 3 + ] + test:[ + name: test_dataset + dataset_type: VideoRecurrentTestDataset + dataroot_gt: /home/cll/Desktop/REDS4/GT + dataroot_lq: /home/cll/Desktop/REDS4/sharp_bicubic + cache_data: True + io_backend:[ + type: disk + ] + num_frame: -1 + phase: test + scale: 4 + n_channels: 3 + ] + ] + netG:[ + net_type: vrt + upscale: 4 + img_size: [6, 64, 64] + window_size: [6, 8, 8] + depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4] + indep_reconsts: [11, 12] + embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180] + num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6] + spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth + pa_frames: 2 + deformable_groups: 12 + nonblind_denoising: False + use_checkpoint_attn: False + use_checkpoint_ffn: False + no_checkpoint_attn_blocks: [] + no_checkpoint_ffn_blocks: [] + init_type: default + scale: 4 + ] + train:[ + G_lossfn_type: charbonnier + G_lossfn_weight: 1.0 + G_charbonnier_eps: 1e-09 + E_decay: 0 + G_optimizer_type: adam + G_optimizer_lr: 0.0004 + G_optimizer_betas: [0.9, 0.99] + G_optimizer_wd: 0 + G_optimizer_clipgrad: None + G_optimizer_reuse: True + fix_iter: 20000 + fix_lr_mul: 0.125 + fix_keys: ['spynet', 'deform'] + total_iter: 300000 + G_scheduler_type: CosineAnnealingWarmRestarts + G_scheduler_periods: 300000 + G_scheduler_eta_min: 1e-07 + G_regularizer_orthstep: None + G_regularizer_clipstep: None + G_param_strict: True + E_param_strict: True + checkpoint_test: 5000 + checkpoint_save: 5000 + checkpoint_print: 200 + F_feature_layer: 34 + F_weights: 1.0 + F_lossfn_type: l1 + F_use_input_norm: True + F_use_range_norm: False + G_scheduler_restart_weights: 1 + ] + val:[ + save_img: False + pad_seq: False + flip_seq: False + center_frame_only: False + num_frame_testing: 40 + num_frame_overlapping: 2 + size_patch_testing: 128 + ] + opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json + is_train: True + merge_bn: False + merge_bn_startpoint: -1 + num_gpu: 8 + rank: 0 + world_size: 1 + +22-03-11 10:52:19.571 : Number of train images: 24,000, iters: 3,000
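Compared with the first run, this configuration resumes from the released 6-frame VRT checkpoint (pretrained_netG) and reads REDS from plain disk folders instead of LMDB. With G_param_strict: True, the checkpoint keys and shapes must match the generator exactly. A hypothetical sketch of that loading step; net_g stands for a VRT instance built from the netG options above, and the 'params' sub-key is an assumption about the checkpoint layout, not something this log confirms:

```python
import torch

def load_pretrained_g(net_g: torch.nn.Module,
                      ckpt_path: str = 'model_zoo/vrt/001_VRT_videosr_bi_REDS_6frames.pth'):
    """Load pretrained_netG into the generator with strict key/shape matching,
    mirroring G_param_strict: True in the options above."""
    state = torch.load(ckpt_path, map_location='cpu')
    if isinstance(state, dict) and 'params' in state:
        # Some released checkpoints nest the weights under a 'params' key
        # (assumption for illustration).
        state = state['params']
    net_g.load_state_dict(state, strict=True)
    return net_g
```

The network description that follows ends each TMSA block with an Mlp_GEGLU whose paired fc11/fc12 projections feed a single fc2. A self-contained sketch of a gated feed-forward consistent with that layout, assuming fc11 carries the GELU gate and that dropout sits before fc2 (placement not confirmed by the log):

```python
import torch
import torch.nn as nn

class MlpGEGLU(nn.Module):
    """Gated-GELU MLP matching the fc11/fc12/fc2 layout in the dump below:
    out = fc2(GELU(fc11(x)) * fc12(x))."""
    def __init__(self, dim: int = 120, hidden: int = 240, drop: float = 0.0):
        super().__init__()
        self.fc11 = nn.Linear(dim, hidden)  # gate branch, passed through GELU
        self.fc12 = nn.Linear(dim, hidden)  # value branch
        self.fc2 = nn.Linear(hidden, dim)
        self.act = nn.GELU()
        self.drop = nn.Dropout(drop)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc2(self.drop(self.act(self.fc11(x)) * self.fc12(x)))
```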
+22-03-11 10:52:33.932 : +Networks name: VRT +Params number: 30676435 +Net structure: +VRT( + (conv_first): Conv3d(27, 120, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (spynet): SpyNet( + (basic_module): ModuleList( + (0): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (1): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (2): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (3): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (4): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (5): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + ) + ) + (stage1): Stage( + (reshape): Sequential( + (0): Rearrange('n c d h w -> n d h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): 
Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): 
Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + )
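Stage 1 above shows the pattern that stages 2-7 repeat below: residual_group1 stacks six TMSA blocks whose WindowAttention carries both a qkv_self and a qkv_mut projection (proj maps the 240 concatenated self- and mutual-attention channels back to 120), residual_group2 stacks two self-attention-only blocks (proj is 120 -> 120), and pa_deform/pa_fuse perform flow-guided deformable alignment and feature fusion. The recurring Mlp_GEGLU is a gated MLP; a sketch consistent with the printed fc11/fc12/fc2 shapes (the exact dropout placement is an assumption):

import torch.nn as nn

class Mlp_GEGLU(nn.Module):
    # Gated-GELU MLP: fc2(GELU(fc11(x)) * fc12(x)).
    def __init__(self, in_features, hidden_features, out_features, drop=0.0):
        super().__init__()
        self.fc11 = nn.Linear(in_features, hidden_features)  # gate branch
        self.fc12 = nn.Linear(in_features, hidden_features)  # value branch
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.act(self.fc11(x)) * self.fc12(x)  # element-wise gating
        return self.fc2(self.drop(x))

Inside TMSA the hidden width is twice the embedding dim (120 -> 240 -> 120); pa_fuse instead takes a 360-channel input (three 120-channel feature maps concatenated) and projects back to 120.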
+ (stage2): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): 
Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage3): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, 
out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + 
(attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage4): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + 
(fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): 
Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage5): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, 
elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, 
elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage6): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() 
+ (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): 
Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage7): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, 
inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, 
out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage8): ModuleList( + (0): Sequential( + (0): Rearrange('n c d h w -> n d h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=120, out_features=180, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (1): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + 
(act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (2): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (3): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) 
+ ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (4): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): 
Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (5): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) 
+ (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (6): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + ) + (norm): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (conv_after_body): Linear(in_features=180, out_features=120, bias=True) + (conv_before_upsample): Sequential( + (0): Conv3d(120, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): LeakyReLU(negative_slope=0.01, inplace=True) + ) + (upsample): Upsample( + (0): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): Transpose_Dim12() + (2): PixelShuffle(upscale_factor=2) + (3): Transpose_Dim12() + (4): LeakyReLU(negative_slope=0.1, inplace=True) + (5): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (6): Transpose_Dim12() + (7): 
PixelShuffle(upscale_factor=2) + (8): Transpose_Dim12() + (9): LeakyReLU(negative_slope=0.1, inplace=True) + (10): Conv3d(64, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + ) + (conv_last): Conv3d(64, 3, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) +) + +22-03-11 10:52:34.115 : + | mean | min | max | std || shape + | -0.000 | -1.462 | 1.580 | 0.103 | torch.Size([120, 27, 1, 3, 3]) || conv_first.weight + | 0.005 | -0.950 | 0.885 | 0.268 | torch.Size([120]) || conv_first.bias + | 0.449 | 0.406 | 0.485 | 0.040 | torch.Size([1, 3, 1, 1]) || spynet.mean + | 0.226 | 0.224 | 0.229 | 0.003 | torch.Size([1, 3, 1, 1]) || spynet.std + | -0.000 | -0.679 | 0.720 | 0.066 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.0.basic_module.0.weight + | -0.042 | -0.894 | 0.351 | 0.344 | torch.Size([32]) || spynet.basic_module.0.basic_module.0.bias + | -0.008 | -3.201 | 0.948 | 0.097 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.0.basic_module.2.weight + | 0.059 | -1.268 | 0.732 | 0.320 | torch.Size([64]) || spynet.basic_module.0.basic_module.2.bias + | -0.010 | -4.633 | 0.568 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.0.basic_module.4.weight + | 0.159 | -0.704 | 0.859 | 0.353 | torch.Size([32]) || spynet.basic_module.0.basic_module.4.bias + | -0.024 | -1.714 | 0.414 | 0.091 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.0.basic_module.6.weight + | 0.780 | -1.061 | 1.162 | 0.519 | torch.Size([16]) || spynet.basic_module.0.basic_module.6.bias + | 0.000 | -0.144 | 0.163 | 0.018 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.0.basic_module.8.weight + | 0.001 | -0.003 | 0.005 | 0.006 | torch.Size([2]) || spynet.basic_module.0.basic_module.8.bias + | 0.000 | -0.726 | 0.773 | 0.070 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.1.basic_module.0.weight + | -0.021 | -0.814 | 0.355 | 0.323 | torch.Size([32]) || spynet.basic_module.1.basic_module.0.bias + | -0.010 | -3.380 | 0.916 | 0.099 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.1.basic_module.2.weight + | 0.038 | -1.207 | 0.714 | 0.301 | torch.Size([64]) || spynet.basic_module.1.basic_module.2.bias + | -0.008 | -4.462 | 0.549 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.1.basic_module.4.weight + | 0.157 | -0.742 | 0.980 | 0.384 | torch.Size([32]) || spynet.basic_module.1.basic_module.4.bias + | -0.020 | -1.648 | 0.319 | 0.084 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.1.basic_module.6.weight + | 0.775 | -1.195 | 1.148 | 0.546 | torch.Size([16]) || spynet.basic_module.1.basic_module.6.bias + | -0.000 | -0.122 | 0.152 | 0.016 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.1.basic_module.8.weight + | -0.000 | -0.002 | 0.001 | 0.002 | torch.Size([2]) || spynet.basic_module.1.basic_module.8.bias + | 0.000 | -0.956 | 0.870 | 0.088 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.2.basic_module.0.weight + | -0.025 | -1.040 | 0.512 | 0.411 | torch.Size([32]) || spynet.basic_module.2.basic_module.0.bias + | -0.011 | -4.624 | 1.195 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.2.basic_module.2.weight + | 0.023 | -1.284 | 0.699 | 0.308 | torch.Size([64]) || spynet.basic_module.2.basic_module.2.bias + | -0.009 | -1.831 | 0.616 | 0.092 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.2.basic_module.4.weight + | 0.120 | -0.695 | 0.755 | 0.332 | torch.Size([32]) || spynet.basic_module.2.basic_module.4.bias + | -0.013 | -1.285 | 0.304 | 0.068 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.2.basic_module.6.weight
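Each row of this dump summarizes one tensor of the freshly built network: mean, min, max and standard deviation of its values, followed by its shape and its name in the state dict. Fixed buffers (e.g. spynet.mean and the relative_position_index tables) are listed alongside learned weights, so the dump covers everything that ends up in a checkpoint. A minimal sketch of how such a table can be produced with plain PyTorch follows; print_param_stats is an illustrative name, not KAIR's exact helper:

    import torch

    def print_param_stats(model: torch.nn.Module) -> None:
        # Header matches the log layout: | mean | min | max | std | shape || name.
        print(' | mean | min | max | std || shape')
        # state_dict() covers buffers as well as parameters, which is why
        # constant tensors such as relative_position_index appear in the table.
        for name, t in model.state_dict().items():
            v = t.detach().float()
            print(f' | {v.mean().item():.3f} | {v.min().item():.3f}'
                  f' | {v.max().item():.3f} | {v.std().item():.3f}'
                  f' | {t.shape} || {name}')

Run on the VRT instance printed above, this would yield rows like the conv_first.weight line that opens the table.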
+ | 0.681 | -1.725 | 0.942 | 0.646 | torch.Size([16]) || spynet.basic_module.2.basic_module.6.bias + | 0.000 | -0.045 | 0.071 | 0.009 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.2.basic_module.8.weight + | -0.010 | -0.010 | -0.009 | 0.000 | torch.Size([2]) || spynet.basic_module.2.basic_module.8.bias + | -0.000 | -0.995 | 0.879 | 0.090 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.3.basic_module.0.weight + | -0.040 | -1.137 | 0.617 | 0.461 | torch.Size([32]) || spynet.basic_module.3.basic_module.0.bias + | -0.010 | -4.891 | 1.224 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.3.basic_module.2.weight + | 0.022 | -1.287 | 0.745 | 0.313 | torch.Size([64]) || spynet.basic_module.3.basic_module.2.bias + | -0.010 | -1.802 | 0.561 | 0.090 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.3.basic_module.4.weight + | 0.118 | -0.694 | 0.697 | 0.329 | torch.Size([32]) || spynet.basic_module.3.basic_module.4.bias + | -0.012 | -1.107 | 0.306 | 0.064 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.3.basic_module.6.weight + | 0.658 | -1.792 | 0.905 | 0.659 | torch.Size([16]) || spynet.basic_module.3.basic_module.6.bias + | 0.000 | -0.030 | 0.037 | 0.006 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.3.basic_module.8.weight + | 0.003 | -0.001 | 0.007 | 0.006 | torch.Size([2]) || spynet.basic_module.3.basic_module.8.bias + | -0.000 | -0.990 | 0.880 | 0.090 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.4.basic_module.0.weight + | -0.010 | -1.067 | 0.596 | 0.437 | torch.Size([32]) || spynet.basic_module.4.basic_module.0.bias + | -0.010 | -5.061 | 1.229 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.4.basic_module.2.weight + | 0.024 | -1.274 | 0.830 | 0.318 | torch.Size([64]) || spynet.basic_module.4.basic_module.2.bias + | -0.009 | -1.787 | 0.563 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.4.basic_module.4.weight + | 0.130 | -0.685 | 0.743 | 0.335 | torch.Size([32]) || spynet.basic_module.4.basic_module.4.bias + | -0.011 | -0.973 | 0.292 | 0.061 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.4.basic_module.6.weight + | 0.659 | -1.855 | 0.931 | 0.679 | torch.Size([16]) || spynet.basic_module.4.basic_module.6.bias + | 0.000 | -0.034 | 0.040 | 0.005 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.4.basic_module.8.weight + | -0.001 | -0.009 | 0.007 | 0.012 | torch.Size([2]) || spynet.basic_module.4.basic_module.8.bias + | -0.000 | -0.973 | 0.853 | 0.089 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.5.basic_module.0.weight + | 0.022 | -1.001 | 0.571 | 0.440 | torch.Size([32]) || spynet.basic_module.5.basic_module.0.bias + | -0.009 | -5.095 | 1.251 | 0.119 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.5.basic_module.2.weight + | 0.026 | -1.305 | 0.880 | 0.326 | torch.Size([64]) || spynet.basic_module.5.basic_module.2.bias + | -0.008 | -1.815 | 0.561 | 0.091 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.5.basic_module.4.weight + | 0.137 | -0.711 | 0.771 | 0.342 | torch.Size([32]) || spynet.basic_module.5.basic_module.4.bias + | -0.010 | -0.986 | 0.286 | 0.059 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.5.basic_module.6.weight + | 0.671 | -1.913 | 0.966 | 0.700 | torch.Size([16]) || spynet.basic_module.5.basic_module.6.bias + | 0.000 | -0.034 | 0.028 | 0.002 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.5.basic_module.8.weight + | 0.002 | -0.013 | 0.016 | 0.020 | torch.Size([2]) || spynet.basic_module.5.basic_module.8.bias + | 1.280 | 0.669 | 1.862 | 0.274 | torch.Size([120]) || stage1.reshape.1.weight + | -0.006 | -0.324 | 0.337 | 0.106 | 
torch.Size([120]) || stage1.reshape.1.bias + | 0.579 | 0.129 | 1.064 | 0.236 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.weight + | -0.039 | -1.100 | 0.894 | 0.226 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.bias + | -0.134 | -4.020 | 2.585 | 0.295 | torch.Size([675, 6]) || stage1.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.579 | 0.618 | 0.113 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_self.weight + | 0.000 | -0.319 | 0.279 | 0.074 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_self.bias + | 0.001 | -0.634 | 0.686 | 0.076 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.attn.proj.weight + | -0.014 | -0.222 | 0.642 | 0.088 | torch.Size([120]) || stage1.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -1.066 | 0.928 | 0.097 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.000 | -0.146 | 0.190 | 0.033 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.781 | 0.367 | 1.203 | 0.160 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.weight + | 0.029 | -0.378 | 0.545 | 0.159 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.bias + | 0.001 | -0.687 | 0.753 | 0.108 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc11.weight + | -0.010 | -0.229 | 0.633 | 0.095 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.674 | 0.669 | 0.117 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc12.weight + | 0.011 | -0.448 | 0.368 | 0.116 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc12.bias + | 0.001 | -0.862 | 0.941 | 0.119 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.mlp.fc2.weight + | -0.004 | -0.267 | 0.594 | 0.099 | torch.Size([120]) || stage1.residual_group1.blocks.0.mlp.fc2.bias + | 0.797 | 0.211 | 1.475 | 0.209 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.weight + | -0.161 | -1.941 | 0.746 | 0.237 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.bias + | -0.296 | -3.927 | 2.840 | 0.478 | torch.Size([675, 6]) || stage1.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.1.attn.position_bias + | 0.001 | -1.479 | 1.395 | 0.143 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_self.weight + | -0.003 | -0.381 | 0.258 | 0.063 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.526 | 0.561 | 0.079 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.attn.proj.weight + | -0.003 | -0.178 | 0.478 | 0.078 | torch.Size([120]) || stage1.residual_group1.blocks.1.attn.proj.bias + | 0.001 | -1.242 | 1.138 | 0.105 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.004 | -0.213 | 0.196 | 0.050 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.702 | 0.349 | 0.904 | 0.085 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.weight + | 0.039 | -0.646 | 0.384 | 0.132 | torch.Size([120]) || 
stage1.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.872 | 0.750 | 0.131 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc11.weight + | -0.049 | -0.353 | 0.135 | 0.084 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.562 | 0.580 | 0.117 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc12.weight + | 0.000 | -0.238 | 0.457 | 0.113 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.828 | 0.685 | 0.123 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.mlp.fc2.weight + | 0.031 | -0.297 | 0.419 | 0.094 | torch.Size([120]) || stage1.residual_group1.blocks.1.mlp.fc2.bias + | 0.984 | 0.163 | 1.398 | 0.202 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.weight + | -0.167 | -1.609 | 0.367 | 0.182 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.bias + | -0.343 | -4.484 | 2.362 | 0.486 | torch.Size([675, 6]) || stage1.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.2.attn.position_bias + | 0.000 | -1.586 | 1.649 | 0.151 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_self.weight + | -0.000 | -0.220 | 0.240 | 0.056 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.378 | 0.514 | 0.086 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.attn.proj.weight + | -0.009 | -0.143 | 0.172 | 0.059 | torch.Size([120]) || stage1.residual_group1.blocks.2.attn.proj.bias + | 0.001 | -0.639 | 0.582 | 0.102 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.000 | -0.141 | 0.173 | 0.035 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.733 | 0.277 | 0.903 | 0.081 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.weight + | 0.038 | -0.861 | 0.359 | 0.142 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.787 | 0.679 | 0.131 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc11.weight + | -0.029 | -0.365 | 0.143 | 0.076 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.574 | 0.539 | 0.120 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc12.weight + | -0.007 | -0.283 | 0.254 | 0.097 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc12.bias + | 0.001 | -0.998 | 0.522 | 0.124 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.mlp.fc2.weight + | 0.030 | -0.169 | 0.293 | 0.095 | torch.Size([120]) || stage1.residual_group1.blocks.2.mlp.fc2.bias + | 1.035 | 0.143 | 1.397 | 0.196 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.weight + | -0.161 | -1.413 | 0.084 | 0.154 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.bias + | -0.441 | -4.685 | 3.306 | 0.529 | torch.Size([675, 6]) || stage1.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.3.attn.position_bias + | 0.000 | -1.590 | 1.329 | 0.155 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_self.weight + | -0.002 | -0.266 | 0.232 | 0.049 | torch.Size([360]) || 
stage1.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.366 | 0.372 | 0.084 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.attn.proj.weight + | -0.011 | -0.225 | 0.171 | 0.071 | torch.Size([120]) || stage1.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.660 | 0.801 | 0.100 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.001 | -0.139 | 0.200 | 0.031 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.724 | 0.190 | 0.911 | 0.091 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.weight + | 0.038 | -0.981 | 0.285 | 0.137 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.bias + | 0.001 | -0.611 | 0.598 | 0.130 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc11.weight + | -0.035 | -0.299 | 0.221 | 0.081 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.502 | 0.520 | 0.124 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc12.weight + | -0.002 | -0.271 | 0.215 | 0.090 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.558 | 0.898 | 0.127 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.mlp.fc2.weight + | 0.010 | -0.424 | 0.190 | 0.082 | torch.Size([120]) || stage1.residual_group1.blocks.3.mlp.fc2.bias + | 1.085 | 0.169 | 1.400 | 0.157 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.weight + | -0.086 | -1.613 | 0.150 | 0.160 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.bias + | -0.541 | -3.902 | 3.728 | 0.633 | torch.Size([675, 6]) || stage1.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.4.attn.position_bias + | 0.001 | -1.879 | 1.832 | 0.150 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_self.weight + | 0.001 | -0.391 | 0.444 | 0.079 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_self.bias + | -0.000 | -0.407 | 0.448 | 0.087 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.attn.proj.weight + | -0.013 | -0.302 | 0.342 | 0.104 | torch.Size([120]) || stage1.residual_group1.blocks.4.attn.proj.bias + | -0.001 | -0.830 | 0.863 | 0.102 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.001 | -0.117 | 0.094 | 0.024 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.704 | 0.195 | 0.870 | 0.079 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.weight + | 0.031 | -1.069 | 0.276 | 0.140 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.656 | 0.555 | 0.130 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc11.weight + | -0.029 | -0.387 | 0.256 | 0.102 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc11.bias + | 0.001 | -0.590 | 0.624 | 0.127 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc12.weight + | -0.011 | -0.277 | 0.303 | 0.087 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -1.124 | 0.539 | 0.130 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.mlp.fc2.weight + | -0.006 | -0.718 | 0.133 | 0.094 | torch.Size([120]) || stage1.residual_group1.blocks.4.mlp.fc2.bias + | 1.037 | 0.176 | 1.327 | 0.158 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.weight + | -0.112 
| -1.591 | 0.177 | 0.169 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.bias + | -0.438 | -2.229 | 2.797 | 0.523 | torch.Size([675, 6]) || stage1.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.5.attn.position_bias + | -0.000 | -2.212 | 1.826 | 0.153 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_self.weight + | 0.001 | -0.343 | 0.338 | 0.068 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_self.bias + | 0.000 | -0.367 | 0.451 | 0.087 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.attn.proj.weight + | -0.022 | -0.358 | 0.242 | 0.128 | torch.Size([120]) || stage1.residual_group1.blocks.5.attn.proj.bias + | 0.001 | -0.922 | 0.886 | 0.104 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.002 | -0.083 | 0.089 | 0.022 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.662 | 0.277 | 0.831 | 0.066 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.weight + | 0.025 | -0.959 | 0.261 | 0.132 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.bias + | -0.001 | -0.636 | 0.739 | 0.129 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc11.weight + | -0.030 | -0.419 | 0.517 | 0.115 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.615 | 0.709 | 0.126 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc12.weight + | 0.002 | -0.230 | 0.457 | 0.087 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc12.bias + | 0.001 | -1.724 | 1.186 | 0.132 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.mlp.fc2.weight + | -0.019 | -1.909 | 0.255 | 0.190 | torch.Size([120]) || stage1.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.242 | 0.244 | 0.057 | torch.Size([120, 120]) || stage1.linear1.weight + | 0.004 | -0.221 | 0.224 | 0.083 | torch.Size([120]) || stage1.linear1.bias + | 0.737 | 0.334 | 1.046 | 0.119 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.weight + | 0.013 | -0.911 | 0.763 | 0.193 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.bias + | -0.052 | -2.462 | 2.040 | 0.273 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.785 | 0.767 | 0.123 | torch.Size([360, 120]) || stage1.residual_group2.blocks.0.attn.qkv_self.weight + | 0.009 | -0.466 | 0.552 | 0.122 | torch.Size([360]) || stage1.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.431 | 0.475 | 0.091 | torch.Size([120, 120]) || stage1.residual_group2.blocks.0.attn.proj.weight + | -0.009 | -0.796 | 0.497 | 0.109 | torch.Size([120]) || stage1.residual_group2.blocks.0.attn.proj.bias + | 0.573 | 0.409 | 0.935 | 0.096 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.weight + | 0.015 | -0.828 | 0.839 | 0.175 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.bias + | 0.001 | -0.604 | 0.542 | 0.109 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc11.weight + | 0.037 | -0.179 | 0.273 | 0.076 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc11.bias
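The relative_position_index entries in these rows are precomputed integer buffers, not learned weights, which is why their statistics never change: for a (2, 8, 8) attention window the index runs over 0..674 on 128x128 token pairs (mean exactly 337.000), and for a (6, 8, 8) window over 0..2474 on 384x384 (mean 1237.000), matching the table sizes logged above. A short sketch of the Swin-style 3D construction, under the assumption that VRT builds its index the same way (the function name is illustrative):

    import torch

    def relative_position_index_3d(d: int, h: int, w: int) -> torch.Tensor:
        # Map every pair of tokens in a (d, h, w) window to an index into a
        # (2d-1)(2h-1)(2w-1)-entry relative-position bias table.
        coords = torch.stack(torch.meshgrid(
            torch.arange(d), torch.arange(h), torch.arange(w), indexing='ij'))
        coords = coords.flatten(1)                      # (3, N) with N = d*h*w
        rel = coords[:, :, None] - coords[:, None, :]   # pairwise offsets, (3, N, N)
        rel = rel.permute(1, 2, 0).contiguous()         # (N, N, 3)
        rel[:, :, 0] += d - 1                           # shift offsets to start at 0
        rel[:, :, 1] += h - 1
        rel[:, :, 2] += w - 1
        rel[:, :, 0] *= (2 * h - 1) * (2 * w - 1)       # flatten 3D offset to 1D index
        rel[:, :, 1] *= 2 * w - 1
        return rel.sum(-1)                              # (N, N) integer index

    idx = relative_position_index_3d(2, 8, 8)    # torch.Size([128, 128]), max 674
    idx2 = relative_position_index_3d(6, 8, 8)   # torch.Size([384, 384]), max 2474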
+ | -0.000 | -0.666 | 0.553 | 0.116 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc12.weight + | -0.001 | -0.416 | 0.396 | 0.116 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc12.bias + | 0.001 | -0.654 | 0.538 | 0.118 | torch.Size([120, 240]) || stage1.residual_group2.blocks.0.mlp.fc2.weight + | -0.002 | -0.470 | 0.310 | 0.122 | torch.Size([120]) || stage1.residual_group2.blocks.0.mlp.fc2.bias + | 0.951 | 0.342 | 1.189 | 0.111 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.weight + | 0.010 | -0.697 | 0.802 | 0.166 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.bias + | -0.098 | -2.648 | 2.410 | 0.214 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.733 | 0.886 | 0.139 | torch.Size([360, 120]) || stage1.residual_group2.blocks.1.attn.qkv_self.weight + | -0.002 | -0.468 | 0.550 | 0.132 | torch.Size([360]) || stage1.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.435 | 0.377 | 0.096 | torch.Size([120, 120]) || stage1.residual_group2.blocks.1.attn.proj.weight + | -0.001 | -0.359 | 0.258 | 0.114 | torch.Size([120]) || stage1.residual_group2.blocks.1.attn.proj.bias + | 0.582 | 0.305 | 0.717 | 0.055 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.weight + | 0.008 | -0.714 | 0.833 | 0.131 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.bias + | 0.001 | -0.732 | 0.501 | 0.118 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc11.weight + | 0.004 | -0.306 | 0.267 | 0.091 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.510 | 0.533 | 0.126 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc12.weight + | -0.000 | -0.315 | 0.291 | 0.090 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc12.bias + | 0.000 | -0.736 | 0.789 | 0.126 | torch.Size([120, 240]) || stage1.residual_group2.blocks.1.mlp.fc2.weight + | -0.000 | -1.274 | 1.328 | 0.200 | torch.Size([120]) || stage1.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.390 | 0.303 | 0.069 | torch.Size([120, 120]) || stage1.linear2.weight + | 0.010 | -0.219 | 0.227 | 0.087 | torch.Size([120]) || stage1.linear2.bias + | -0.000 | -0.095 | 0.106 | 0.024 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.weight + | -0.001 | -0.036 | 0.036 | 0.013 | torch.Size([120]) || stage1.pa_deform.bias + | -0.000 | -0.136 | 0.141 | 0.017 | torch.Size([120, 242, 3, 3]) || stage1.pa_deform.conv_offset.0.weight + | -0.002 | -0.028 | 0.024 | 0.013 | torch.Size([120]) || stage1.pa_deform.conv_offset.0.bias + | -0.001 | -0.156 | 0.104 | 0.019 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.2.weight + | -0.008 | -0.055 | 0.045 | 0.022 | torch.Size([120]) || stage1.pa_deform.conv_offset.2.bias + | -0.001 | -0.098 | 0.106 | 0.018 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.4.weight + | -0.000 | -0.081 | 0.070 | 0.029 | torch.Size([120]) || stage1.pa_deform.conv_offset.4.bias + | -0.000 | -0.375 | 0.279 | 0.027 | torch.Size([324, 120, 3, 3]) || stage1.pa_deform.conv_offset.6.weight + | -0.003 | -0.074 | 0.070 | 0.028 | torch.Size([324]) || stage1.pa_deform.conv_offset.6.bias + | -0.000 | -0.776 | 0.733 | 0.114 | torch.Size([360, 360]) || stage1.pa_fuse.fc11.weight + | 0.021 | -0.239 | 0.513 | 0.121 | torch.Size([360]) || stage1.pa_fuse.fc11.bias + | 0.001 | -1.100 | 1.143 | 0.149 | torch.Size([360, 360]) || stage1.pa_fuse.fc12.weight + | 0.008 | 
-0.405 | 0.393 | 0.136 | torch.Size([360]) || stage1.pa_fuse.fc12.bias + | 0.000 | -0.963 | 0.899 | 0.142 | torch.Size([120, 360]) || stage1.pa_fuse.fc2.weight + | -0.055 | -0.616 | 0.599 | 0.197 | torch.Size([120]) || stage1.pa_fuse.fc2.bias + | 1.149 | 0.345 | 1.921 | 0.289 | torch.Size([480]) || stage2.reshape.1.weight + | 0.017 | -0.502 | 0.663 | 0.141 | torch.Size([480]) || stage2.reshape.1.bias + | -0.000 | -0.609 | 0.736 | 0.146 | torch.Size([120, 480]) || stage2.reshape.2.weight + | 0.006 | -0.136 | 0.404 | 0.077 | torch.Size([120]) || stage2.reshape.2.bias + | 0.686 | 0.172 | 1.113 | 0.175 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.weight + | -0.154 | -0.926 | 0.339 | 0.217 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.bias + | -0.120 | -1.869 | 4.616 | 0.310 | torch.Size([675, 6]) || stage2.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.514 | 0.499 | 0.102 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_self.weight + | -0.002 | -0.214 | 0.177 | 0.044 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_self.bias + | -0.001 | -0.499 | 0.529 | 0.093 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.attn.proj.weight + | -0.004 | -0.171 | 0.556 | 0.087 | torch.Size([120]) || stage2.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.642 | 0.598 | 0.083 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.141 | 0.125 | 0.027 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.592 | 0.325 | 0.794 | 0.096 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.weight + | 0.008 | -0.649 | 0.445 | 0.168 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.485 | 0.457 | 0.116 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc11.weight + | -0.053 | -0.240 | 0.171 | 0.062 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.503 | 0.462 | 0.118 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc12.weight + | 0.005 | -0.177 | 0.268 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.690 | 0.498 | 0.123 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.mlp.fc2.weight + | -0.007 | -0.270 | 0.472 | 0.097 | torch.Size([120]) || stage2.residual_group1.blocks.0.mlp.fc2.bias + | 0.864 | 0.187 | 1.221 | 0.164 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.weight + | -0.146 | -1.128 | 0.299 | 0.204 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.bias + | -0.241 | -1.607 | 8.958 | 0.356 | torch.Size([675, 6]) || stage2.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.561 | 0.538 | 0.116 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_self.weight + | 0.001 | -0.198 | 0.222 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_self.bias + | 0.001 | -0.475 | 0.479 | 0.099 | torch.Size([120, 240]) || 
stage2.residual_group1.blocks.1.attn.proj.weight + | -0.006 | -0.295 | 0.341 | 0.101 | torch.Size([120]) || stage2.residual_group1.blocks.1.attn.proj.bias + | 0.001 | -0.961 | 0.789 | 0.080 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.001 | -0.105 | 0.143 | 0.024 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.653 | 0.401 | 0.810 | 0.063 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.weight + | 0.009 | -0.767 | 0.367 | 0.154 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.486 | 0.499 | 0.117 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc11.weight + | -0.056 | -0.185 | 0.147 | 0.058 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.548 | 0.121 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc12.weight + | 0.002 | -0.231 | 0.177 | 0.071 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc12.bias + | -0.001 | -0.578 | 0.609 | 0.123 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.mlp.fc2.weight + | -0.003 | -0.350 | 0.216 | 0.098 | torch.Size([120]) || stage2.residual_group1.blocks.1.mlp.fc2.bias + | 0.848 | 0.172 | 1.107 | 0.144 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.weight + | -0.168 | -1.123 | 0.330 | 0.178 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.bias + | -0.074 | -1.239 | 4.293 | 0.247 | torch.Size([675, 6]) || stage2.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.2.attn.position_bias + | -0.001 | -0.643 | 0.531 | 0.117 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_self.weight + | 0.003 | -0.220 | 0.376 | 0.047 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.529 | 0.479 | 0.100 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.230 | 0.295 | 0.074 | torch.Size([120]) || stage2.residual_group1.blocks.2.attn.proj.bias + | -0.001 | -0.726 | 0.768 | 0.091 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.001 | -0.167 | 0.193 | 0.028 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.695 | 0.334 | 0.833 | 0.068 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.weight + | 0.012 | -0.755 | 0.517 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.bias + | 0.001 | -0.474 | 0.480 | 0.119 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc11.weight + | -0.049 | -0.218 | 0.148 | 0.067 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.529 | 0.542 | 0.124 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc12.weight + | -0.006 | -0.245 | 0.239 | 0.073 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc12.bias + | -0.001 | -0.541 | 0.485 | 0.124 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.mlp.fc2.weight + | 0.000 | -0.318 | 0.170 | 0.077 | torch.Size([120]) || stage2.residual_group1.blocks.2.mlp.fc2.bias + | 0.903 | 0.178 | 1.124 | 0.124 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.weight + | -0.138 | -1.223 | 0.440 | 0.177 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.bias + | -0.164 | -1.383 | 5.910 | 
0.305 | torch.Size([675, 6]) || stage2.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.526 | 0.496 | 0.120 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_self.weight + | 0.000 | -0.250 | 0.273 | 0.061 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.447 | 0.524 | 0.097 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.attn.proj.weight + | -0.003 | -0.243 | 0.256 | 0.082 | torch.Size([120]) || stage2.residual_group1.blocks.3.attn.proj.bias + | -0.001 | -0.551 | 0.730 | 0.083 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.001 | -0.145 | 0.126 | 0.024 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.707 | 0.319 | 0.855 | 0.063 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.weight + | 0.013 | -0.839 | 0.507 | 0.155 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.509 | 0.508 | 0.118 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc11.weight + | -0.051 | -0.219 | 0.155 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.475 | 0.592 | 0.124 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc12.weight + | -0.002 | -0.162 | 0.220 | 0.069 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.465 | 0.528 | 0.124 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.mlp.fc2.weight + | -0.002 | -0.243 | 0.286 | 0.088 | torch.Size([120]) || stage2.residual_group1.blocks.3.mlp.fc2.bias + | 0.948 | 0.220 | 1.175 | 0.108 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.weight + | -0.125 | -1.093 | 0.385 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.bias + | -0.150 | -1.632 | 4.522 | 0.341 | torch.Size([675, 6]) || stage2.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.636 | 0.543 | 0.119 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_self.weight + | -0.001 | -0.254 | 0.262 | 0.048 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_self.bias + | 0.001 | -0.632 | 0.628 | 0.112 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.attn.proj.weight + | -0.005 | -0.240 | 0.330 | 0.104 | torch.Size([120]) || stage2.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.476 | 0.479 | 0.088 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.001 | -0.112 | 0.134 | 0.020 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.686 | 0.264 | 0.797 | 0.060 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.weight + | 0.012 | -0.889 | 0.427 | 0.140 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.bias + | 0.001 | -0.476 | 0.478 | 0.117 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc11.weight + | -0.051 | -0.267 | 0.180 | 0.071 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.506 | 0.517 
| 0.127 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc12.weight + | 0.002 | -0.172 | 0.241 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc12.bias + | -0.001 | -0.570 | 0.542 | 0.126 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.mlp.fc2.weight + | -0.003 | -0.631 | 0.395 | 0.123 | torch.Size([120]) || stage2.residual_group1.blocks.4.mlp.fc2.bias + | 0.912 | 0.189 | 1.122 | 0.104 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.weight + | -0.114 | -1.125 | 0.188 | 0.140 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.bias + | -0.099 | -1.285 | 1.708 | 0.236 | torch.Size([675, 6]) || stage2.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.496 | 0.540 | 0.119 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_self.weight + | 0.003 | -0.260 | 0.228 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.511 | 0.454 | 0.095 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.attn.proj.weight + | 0.000 | -0.711 | 0.286 | 0.115 | torch.Size([120]) || stage2.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.444 | 0.454 | 0.082 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.000 | -0.101 | 0.133 | 0.021 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.668 | 0.312 | 0.800 | 0.056 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.weight + | 0.015 | -0.778 | 0.372 | 0.111 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.485 | 0.469 | 0.115 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc11.weight + | -0.045 | -0.294 | 0.173 | 0.083 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc11.bias + | 0.000 | -0.554 | 0.540 | 0.129 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc12.weight + | 0.001 | -0.183 | 0.199 | 0.077 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.879 | 0.824 | 0.127 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.mlp.fc2.weight + | 0.001 | -1.670 | 0.358 | 0.208 | torch.Size([120]) || stage2.residual_group1.blocks.5.mlp.fc2.bias + | 0.001 | -0.253 | 0.346 | 0.068 | torch.Size([120, 120]) || stage2.linear1.weight + | 0.007 | -0.248 | 0.241 | 0.103 | torch.Size([120]) || stage2.linear1.bias + | 1.012 | 0.613 | 1.327 | 0.116 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.weight + | 0.019 | -0.724 | 0.685 | 0.244 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.bias + | 0.003 | -2.959 | 1.705 | 0.151 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.636 | 0.617 | 0.125 | torch.Size([360, 120]) || stage2.residual_group2.blocks.0.attn.qkv_self.weight + | -0.002 | -0.291 | 0.292 | 0.085 | torch.Size([360]) || stage2.residual_group2.blocks.0.attn.qkv_self.bias + | -0.002 | -0.476 | 0.512 | 0.138 | torch.Size([120, 120]) || stage2.residual_group2.blocks.0.attn.proj.weight + | -0.002 | -0.263 | 0.398 | 0.135 | torch.Size([120]) || 
stage2.residual_group2.blocks.0.attn.proj.bias + | 0.677 | 0.521 | 0.840 | 0.063 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.weight + | 0.010 | -0.710 | 0.541 | 0.173 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.bias + | 0.001 | -0.540 | 0.507 | 0.112 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc11.weight + | -0.016 | -0.242 | 0.201 | 0.077 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc11.bias + | 0.000 | -0.519 | 0.479 | 0.122 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc12.weight + | -0.006 | -0.162 | 0.231 | 0.071 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc12.bias + | -0.001 | -0.449 | 0.494 | 0.121 | torch.Size([120, 240]) || stage2.residual_group2.blocks.0.mlp.fc2.weight + | 0.002 | -0.293 | 0.222 | 0.095 | torch.Size([120]) || stage2.residual_group2.blocks.0.mlp.fc2.bias + | 1.053 | 0.832 | 1.269 | 0.079 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.weight + | 0.015 | -0.549 | 0.428 | 0.189 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.bias + | 0.007 | -3.099 | 1.550 | 0.170 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.673 | 0.604 | 0.131 | torch.Size([360, 120]) || stage2.residual_group2.blocks.1.attn.qkv_self.weight + | -0.001 | -0.416 | 0.391 | 0.089 | torch.Size([360]) || stage2.residual_group2.blocks.1.attn.qkv_self.bias + | -0.000 | -0.569 | 0.560 | 0.139 | torch.Size([120, 120]) || stage2.residual_group2.blocks.1.attn.proj.weight + | 0.004 | -0.613 | 0.428 | 0.158 | torch.Size([120]) || stage2.residual_group2.blocks.1.attn.proj.bias + | 0.762 | 0.464 | 0.954 | 0.085 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.weight + | 0.005 | -0.745 | 0.381 | 0.117 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.441 | 0.448 | 0.110 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc11.weight + | 0.019 | -0.292 | 0.460 | 0.117 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.491 | 0.490 | 0.126 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc12.weight + | -0.007 | -0.285 | 0.177 | 0.068 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.535 | 0.631 | 0.125 | torch.Size([120, 240]) || stage2.residual_group2.blocks.1.mlp.fc2.weight + | -0.011 | -0.765 | 0.337 | 0.142 | torch.Size([120]) || stage2.residual_group2.blocks.1.mlp.fc2.bias + | 0.001 | -0.367 | 0.372 | 0.074 | torch.Size([120, 120]) || stage2.linear2.weight + | 0.009 | -0.288 | 0.342 | 0.130 | torch.Size([120]) || stage2.linear2.bias + | 0.000 | -0.112 | 0.093 | 0.022 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.weight + | -0.002 | -0.036 | 0.035 | 0.016 | torch.Size([120]) || stage2.pa_deform.bias + | 0.000 | -0.068 | 0.080 | 0.016 | torch.Size([120, 242, 3, 3]) || stage2.pa_deform.conv_offset.0.weight + | -0.009 | -0.035 | 0.023 | 0.013 | torch.Size([120]) || stage2.pa_deform.conv_offset.0.bias + | 0.000 | -0.068 | 0.079 | 0.019 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.2.weight + | -0.014 | -0.061 | 0.036 | 0.021 | torch.Size([120]) || stage2.pa_deform.conv_offset.2.bias + | -0.001 | -0.082 | 0.079 | 0.019 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.4.weight + | -0.003 | -0.075 | 0.069 | 0.035 | 
torch.Size([120]) || stage2.pa_deform.conv_offset.4.bias + | -0.000 | -0.166 | 0.139 | 0.016 | torch.Size([324, 120, 3, 3]) || stage2.pa_deform.conv_offset.6.weight + | -0.015 | -0.090 | 0.050 | 0.030 | torch.Size([324]) || stage2.pa_deform.conv_offset.6.bias + | -0.002 | -0.642 | 0.663 | 0.127 | torch.Size([360, 360]) || stage2.pa_fuse.fc11.weight + | 0.130 | -0.171 | 0.480 | 0.140 | torch.Size([360]) || stage2.pa_fuse.fc11.bias + | -0.000 | -0.696 | 0.620 | 0.118 | torch.Size([360, 360]) || stage2.pa_fuse.fc12.weight + | -0.007 | -0.337 | 0.301 | 0.102 | torch.Size([360]) || stage2.pa_fuse.fc12.bias + | 0.000 | -0.650 | 0.657 | 0.128 | torch.Size([120, 360]) || stage2.pa_fuse.fc2.weight + | 0.013 | -0.507 | 0.451 | 0.215 | torch.Size([120]) || stage2.pa_fuse.fc2.bias + | 1.067 | 0.372 | 1.778 | 0.269 | torch.Size([480]) || stage3.reshape.1.weight + | -0.004 | -0.699 | 0.521 | 0.227 | torch.Size([480]) || stage3.reshape.1.bias + | -0.000 | -0.643 | 0.743 | 0.138 | torch.Size([120, 480]) || stage3.reshape.2.weight + | 0.009 | -0.176 | 0.243 | 0.079 | torch.Size([120]) || stage3.reshape.2.bias + | 0.785 | 0.469 | 1.029 | 0.105 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.weight + | -0.102 | -0.716 | 0.311 | 0.179 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.bias + | -0.001 | -0.340 | 0.163 | 0.033 | torch.Size([675, 6]) || stage3.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.328 | 0.302 | 0.061 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_self.weight + | 0.004 | -0.232 | 0.189 | 0.063 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.343 | 0.346 | 0.058 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.attn.proj.weight + | 0.004 | -0.335 | 0.229 | 0.102 | torch.Size([120]) || stage3.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.366 | 0.325 | 0.052 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.001 | -0.091 | 0.074 | 0.017 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.751 | 0.517 | 0.928 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.weight + | 0.002 | -0.271 | 0.189 | 0.101 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.371 | 0.388 | 0.096 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc11.weight + | -0.073 | -0.203 | 0.039 | 0.046 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.400 | 0.401 | 0.094 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc12.weight + | -0.000 | -0.178 | 0.128 | 0.052 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc12.bias + | -0.001 | -0.410 | 0.429 | 0.098 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.mlp.fc2.weight + | 0.006 | -0.345 | 0.304 | 0.108 | torch.Size([120]) || stage3.residual_group1.blocks.0.mlp.fc2.bias + | 0.816 | 0.469 | 1.015 | 0.110 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.weight + | -0.103 | -0.647 | 0.225 | 0.140 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.bias + | 0.001 | -0.464 | 0.239 | 0.034 | torch.Size([675, 6]) || stage3.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 
0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.1.attn.position_bias + | -0.000 | -0.304 | 0.359 | 0.061 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_self.weight + | 0.001 | -0.173 | 0.193 | 0.047 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.299 | 0.408 | 0.055 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.attn.proj.weight + | 0.007 | -0.511 | 0.239 | 0.113 | torch.Size([120]) || stage3.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.288 | 0.254 | 0.049 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.001 | -0.060 | 0.054 | 0.016 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.796 | 0.609 | 0.971 | 0.076 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.weight + | -0.002 | -0.327 | 0.247 | 0.122 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.379 | 0.407 | 0.094 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc11.weight + | -0.077 | -0.214 | 0.034 | 0.045 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.391 | 0.432 | 0.092 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc12.weight + | 0.005 | -0.176 | 0.112 | 0.044 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc12.bias + | 0.000 | -0.378 | 0.399 | 0.093 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.mlp.fc2.weight + | 0.009 | -0.410 | 0.306 | 0.110 | torch.Size([120]) || stage3.residual_group1.blocks.1.mlp.fc2.bias + | 0.854 | 0.447 | 0.995 | 0.090 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.weight + | -0.086 | -0.513 | 0.198 | 0.116 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.bias + | -0.001 | -0.189 | 0.292 | 0.033 | torch.Size([675, 6]) || stage3.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.390 | 0.367 | 0.067 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_self.weight + | -0.002 | -0.310 | 0.284 | 0.078 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.334 | 0.296 | 0.061 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.attn.proj.weight + | 0.004 | -0.356 | 0.299 | 0.096 | torch.Size([120]) || stage3.residual_group1.blocks.2.attn.proj.bias + | 0.000 | -0.276 | 0.315 | 0.055 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.000 | -0.094 | 0.066 | 0.014 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.829 | 0.673 | 1.017 | 0.074 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.weight + | 0.003 | -0.259 | 0.228 | 0.098 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.bias + | 0.001 | -0.410 | 0.385 | 0.091 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc11.weight + | -0.085 | -0.200 | 0.017 | 0.044 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.348 | 0.378 | 0.090 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc12.weight + | 0.001 | -0.130 | 0.105 | 0.042 | 
torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.346 | 0.425 | 0.090 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.mlp.fc2.weight + | 0.005 | -0.363 | 0.241 | 0.094 | torch.Size([120]) || stage3.residual_group1.blocks.2.mlp.fc2.bias + | 0.872 | 0.554 | 1.068 | 0.102 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.weight + | -0.057 | -0.402 | 0.133 | 0.087 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.bias + | 0.003 | -0.365 | 0.217 | 0.050 | torch.Size([675, 6]) || stage3.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.359 | 0.357 | 0.065 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_self.weight + | -0.002 | -0.265 | 0.294 | 0.062 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.300 | 0.271 | 0.054 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.attn.proj.weight + | 0.002 | -0.316 | 0.215 | 0.094 | torch.Size([120]) || stage3.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.370 | 0.329 | 0.039 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.000 | -0.056 | 0.066 | 0.013 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.842 | 0.631 | 0.989 | 0.073 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.weight + | -0.001 | -0.216 | 0.263 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.bias + | 0.001 | -0.388 | 0.391 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc11.weight + | -0.087 | -0.202 | 0.032 | 0.048 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.364 | 0.428 | 0.088 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc12.weight + | -0.000 | -0.137 | 0.106 | 0.043 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc12.bias + | -0.001 | -0.390 | 0.339 | 0.088 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.mlp.fc2.weight + | 0.003 | -0.376 | 0.203 | 0.090 | torch.Size([120]) || stage3.residual_group1.blocks.3.mlp.fc2.bias + | 0.913 | 0.498 | 1.102 | 0.096 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.weight + | -0.048 | -0.340 | 0.105 | 0.071 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.bias + | 0.001 | -0.706 | 0.306 | 0.058 | torch.Size([675, 6]) || stage3.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.4.attn.position_bias + | 0.000 | -0.373 | 0.339 | 0.076 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_self.weight + | -0.004 | -0.301 | 0.301 | 0.074 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.278 | 0.277 | 0.058 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.attn.proj.weight + | 0.003 | -0.310 | 0.240 | 0.079 | torch.Size([120]) || stage3.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.350 | 0.322 | 0.046 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.000 | -0.045 | 0.064 | 0.010 | 
torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.862 | 0.679 | 0.990 | 0.059 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.weight
+ | -0.004 | -0.313 | 0.190 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.bias
+ | 0.001 | -0.370 | 0.364 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc11.weight
+ | -0.092 | -0.231 | 0.129 | 0.057 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc11.bias
+ | -0.000 | -0.375 | 0.511 | 0.090 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.002 | -0.114 | 0.114 | 0.040 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc12.bias
+ | -0.000 | -0.389 | 0.354 | 0.088 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.mlp.fc2.weight
+ | 0.005 | -0.258 | 0.164 | 0.073 | torch.Size([120]) || stage3.residual_group1.blocks.4.mlp.fc2.bias
+ | 0.899 | 0.480 | 1.089 | 0.103 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.weight
+ | -0.030 | -0.257 | 0.115 | 0.056 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.bias
+ | 0.003 | -0.462 | 0.290 | 0.069 | torch.Size([675, 6]) || stage3.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.5.attn.position_bias
+ | 0.000 | -0.391 | 0.365 | 0.069 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_self.weight
+ | -0.004 | -0.232 | 0.302 | 0.064 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_self.bias
+ | -0.000 | -0.267 | 0.293 | 0.051 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.attn.proj.weight
+ | 0.000 | -0.250 | 0.182 | 0.070 | torch.Size([120]) || stage3.residual_group1.blocks.5.attn.proj.bias
+ | -0.000 | -0.238 | 0.257 | 0.033 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_mut.weight
+ | -0.001 | -0.032 | 0.033 | 0.008 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.864 | 0.651 | 1.029 | 0.070 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.weight
+ | -0.003 | -0.212 | 0.175 | 0.075 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.bias
+ | 0.000 | -0.378 | 0.379 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc11.weight
+ | -0.097 | -0.308 | 0.026 | 0.051 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc11.bias
+ | 0.000 | -0.578 | 0.401 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc12.weight
+ | -0.005 | -0.166 | 0.131 | 0.049 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc12.bias
+ | 0.000 | -0.358 | 0.376 | 0.085 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.mlp.fc2.weight
+ | 0.001 | -0.262 | 0.176 | 0.072 | torch.Size([120]) || stage3.residual_group1.blocks.5.mlp.fc2.bias
+ | 0.003 | -0.284 | 0.467 | 0.071 | torch.Size([120, 120]) || stage3.linear1.weight
+ | 0.006 | -0.201 | 0.269 | 0.090 | torch.Size([120]) || stage3.linear1.bias
+ | 0.877 | 0.568 | 1.197 | 0.115 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.weight
+ | 0.002 | -0.248 | 0.324 | 0.100 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.bias
+ | 0.000 | -0.261 | 0.125 | 0.029 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.563 | 0.552 | 0.074 | torch.Size([360, 120]) || stage3.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.005 | -0.257 | 0.302 | 0.081 | torch.Size([360]) || stage3.residual_group2.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.390 | 0.385 | 0.084 | torch.Size([120, 120]) || stage3.residual_group2.blocks.0.attn.proj.weight
+ | 0.002 | -0.450 | 0.235 | 0.125 | torch.Size([120]) || stage3.residual_group2.blocks.0.attn.proj.bias
+ | 0.986 | 0.755 | 1.165 | 0.078 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.weight
+ | -0.000 | -0.260 | 0.169 | 0.076 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.bias
+ | 0.000 | -0.355 | 0.397 | 0.087 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.046 | -0.220 | 0.086 | 0.055 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.424 | 0.368 | 0.089 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.006 | -0.111 | 0.122 | 0.038 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.354 | 0.374 | 0.090 | torch.Size([120, 240]) || stage3.residual_group2.blocks.0.mlp.fc2.weight
+ | 0.001 | -0.374 | 0.272 | 0.101 | torch.Size([120]) || stage3.residual_group2.blocks.0.mlp.fc2.bias
+ | 0.919 | 0.643 | 1.132 | 0.100 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.weight
+ | 0.000 | -0.177 | 0.181 | 0.063 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.bias
+ | 0.000 | -0.332 | 0.131 | 0.028 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.1.attn.relative_position_index
+ | -0.000 | -0.418 | 0.362 | 0.069 | torch.Size([360, 120]) || stage3.residual_group2.blocks.1.attn.qkv_self.weight
+ | -0.004 | -0.375 | 0.347 | 0.082 | torch.Size([360]) || stage3.residual_group2.blocks.1.attn.qkv_self.bias
+ | -0.001 | -0.294 | 0.354 | 0.077 | torch.Size([120, 120]) || stage3.residual_group2.blocks.1.attn.proj.weight
+ | 0.003 | -0.432 | 0.259 | 0.101 | torch.Size([120]) || stage3.residual_group2.blocks.1.attn.proj.bias
+ | 1.012 | 0.750 | 1.178 | 0.077 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.weight
+ | -0.001 | -0.171 | 0.155 | 0.060 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.bias
+ | 0.000 | -0.331 | 0.356 | 0.087 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.035 | -0.207 | 0.197 | 0.065 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.399 | 0.398 | 0.092 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc12.weight
+ | -0.002 | -0.111 | 0.129 | 0.041 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.001 | -0.353 | 0.330 | 0.088 | torch.Size([120, 240]) || stage3.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.001 | -0.328 | 0.127 | 0.064 | torch.Size([120]) || stage3.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.003 | -0.289 | 0.519 | 0.073 | torch.Size([120, 120]) || stage3.linear2.weight
+ | 0.002 | -0.318 | 0.371 | 0.144 | torch.Size([120]) || stage3.linear2.bias
+ | -0.000 | -0.086 | 0.095 | 0.022 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.weight
+ | -0.002 | -0.023 | 0.021 | 0.010 | torch.Size([120]) || stage3.pa_deform.bias
+ | -0.000 | -0.060 | 0.056 | 0.015 | torch.Size([120, 242, 3, 3]) || stage3.pa_deform.conv_offset.0.weight
+ | -0.008 | -0.035 | 0.019 | 0.013 | torch.Size([120]) || stage3.pa_deform.conv_offset.0.bias
+ | -0.001 | -0.064 | 0.062 | 0.019 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.2.weight
+ | -0.007 | -0.044 | 0.031 | 0.019 | torch.Size([120]) || stage3.pa_deform.conv_offset.2.bias
+ | 0.000 | -0.062 | 0.063 | 0.019 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.4.weight
+ | -0.006 | -0.052 | 0.043 | 0.021 | torch.Size([120]) || stage3.pa_deform.conv_offset.4.bias
+ | 0.000 | -0.081 | 0.080 | 0.011 | torch.Size([324, 120, 3, 3]) || stage3.pa_deform.conv_offset.6.weight
+ | -0.004 | -0.087 | 0.083 | 0.021 | torch.Size([324]) || stage3.pa_deform.conv_offset.6.bias
+ | -0.002 | -0.465 | 0.513 | 0.101 | torch.Size([360, 360]) || stage3.pa_fuse.fc11.weight
+ | 0.059 | -0.251 | 0.595 | 0.104 | torch.Size([360]) || stage3.pa_fuse.fc11.bias
+ | -0.000 | -0.544 | 0.531 | 0.100 | torch.Size([360, 360]) || stage3.pa_fuse.fc12.weight
+ | 0.001 | -0.589 | 0.433 | 0.106 | torch.Size([360]) || stage3.pa_fuse.fc12.bias
+ | -0.000 | -0.535 | 0.562 | 0.127 | torch.Size([120, 360]) || stage3.pa_fuse.fc2.weight
+ | -0.001 | -0.401 | 0.342 | 0.121 | torch.Size([120]) || stage3.pa_fuse.fc2.bias
+ | 0.997 | 0.921 | 1.125 | 0.028 | torch.Size([480]) || stage4.reshape.1.weight
+ | -0.000 | -0.058 | 0.059 | 0.022 | torch.Size([480]) || stage4.reshape.1.bias
+ | 0.000 | -0.155 | 0.150 | 0.031 | torch.Size([120, 480]) || stage4.reshape.2.weight
+ | 0.001 | -0.016 | 0.016 | 0.006 | torch.Size([120]) || stage4.reshape.2.bias
+ | 1.002 | 0.999 | 1.009 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.weight
+ | 0.000 | -0.002 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.bias
+ | -0.000 | -0.071 | 0.066 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.0.attn.position_bias
+ | 0.000 | -0.093 | 0.081 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.000 | -0.009 | 0.009 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.080 | 0.097 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.attn.proj.weight
+ | 0.000 | -0.035 | 0.027 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.0.attn.proj.bias
+ | 0.000 | -0.080 | 0.079 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_mut.weight
+ | -0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.bias
+ | -0.000 | -0.079 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc11.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.087 | 0.092 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc12.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.080 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.031 | 0.029 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.0.mlp.fc2.bias
+ | 1.002 | 0.997 | 1.007 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.weight
+ | -0.000 | -0.002 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.bias
+ | 0.000 | -0.066 | 0.065 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.1.attn.position_bias
+ | -0.000 | -0.078 | 0.081 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.006 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.080 | 0.083 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.attn.proj.weight
+ | -0.000 | -0.027 | 0.029 | 0.012 | torch.Size([120]) || stage4.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.077 | 0.082 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_mut.weight
+ | -0.000 | -0.006 | 0.009 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.weight
+ | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.bias
+ | -0.000 | -0.080 | 0.078 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.077 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.084 | 0.075 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.000 | -0.034 | 0.031 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.1.mlp.fc2.bias
+ | 1.002 | 0.996 | 1.008 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.weight
+ | -0.000 | -0.003 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.bias
+ | 0.001 | -0.070 | 0.071 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.2.attn.position_bias
+ | 0.000 | -0.091 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_self.weight
+ | -0.000 | -0.007 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_self.bias
+ | 0.000 | -0.080 | 0.084 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.attn.proj.weight
+ | -0.000 | -0.023 | 0.026 | 0.010 | torch.Size([120]) || stage4.residual_group1.blocks.2.attn.proj.bias
+ | -0.000 | -0.107 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_mut.weight
+ | 0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || 
stage4.residual_group1.blocks.2.norm2.weight + | 0.000 | -0.000 | 0.001 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.076 | 0.077 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc11.weight + | -0.000 | -0.005 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -2.000 | 0.081 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc12.weight + | 0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc12.bias + | -0.000 | -0.084 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.mlp.fc2.weight + | 0.000 | -0.027 | 0.024 | 0.010 | torch.Size([120]) || stage4.residual_group1.blocks.2.mlp.fc2.bias + | 1.002 | 0.999 | 1.012 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.weight + | -0.000 | -0.003 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.bias + | 0.000 | -0.064 | 0.071 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.099 | 0.088 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_self.weight + | 0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.083 | 0.084 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.attn.proj.weight + | -0.000 | -0.019 | 0.018 | 0.008 | torch.Size([120]) || stage4.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.079 | 0.084 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.000 | -0.004 | 0.004 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.weight + | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.bias + | -0.000 | -0.078 | 0.081 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc11.weight + | -0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.087 | 0.076 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc12.weight + | -0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc12.bias + | -0.000 | -0.079 | 0.082 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.mlp.fc2.weight + | 0.000 | -0.022 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group1.blocks.3.mlp.fc2.bias + | 1.002 | 0.998 | 1.011 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.weight + | -0.001 | -0.004 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.bias + | 0.000 | -0.089 | 0.081 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.080 | 0.085 | 0.020 | torch.Size([360, 120]) || 
stage4.residual_group1.blocks.4.attn.qkv_self.weight + | -0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_self.bias + | -0.000 | -0.075 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.attn.proj.weight + | -0.000 | -0.021 | 0.016 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.082 | 0.088 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.000 | -0.004 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.weight + | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.086 | 0.080 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc11.weight + | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.084 | 0.083 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc12.weight + | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.076 | 0.081 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.mlp.fc2.weight + | -0.000 | -0.018 | 0.015 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.4.mlp.fc2.bias + | 1.003 | 0.997 | 1.014 | 0.003 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.weight + | -0.001 | -0.005 | 0.004 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.bias + | -0.001 | -0.070 | 0.069 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.097 | 0.082 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_self.weight + | 0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.075 | 0.089 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.attn.proj.weight + | 0.000 | -0.016 | 0.015 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.083 | 0.091 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.000 | -0.006 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.weight + | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.bias + | 0.000 | -0.093 | 0.083 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc11.weight + | 0.000 | -0.002 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc11.bias + | 0.000 | -0.086 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc12.weight + | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.079 | 0.092 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.mlp.fc2.weight + | -0.000 | -0.012 | 0.016 | 0.005 | torch.Size([120]) || stage4.residual_group1.blocks.5.mlp.fc2.bias + | 
-0.000 | -0.090 | 0.111 | 0.024 | torch.Size([120, 120]) || stage4.linear1.weight + | 0.001 | -0.019 | 0.029 | 0.009 | torch.Size([120]) || stage4.linear1.bias + | 1.000 | 0.999 | 1.003 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.weight + | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.bias + | -0.000 | -0.078 | 0.075 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.084 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group2.blocks.0.attn.qkv_self.weight + | 0.000 | -0.005 | 0.004 | 0.001 | torch.Size([360]) || stage4.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.079 | 0.080 | 0.020 | torch.Size([120, 120]) || stage4.residual_group2.blocks.0.attn.proj.weight + | 0.000 | -0.021 | 0.024 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.weight + | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.bias + | -0.000 | -0.079 | 0.072 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc11.weight + | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc11.bias + | 0.000 | -0.077 | 0.078 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc12.weight + | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc12.bias + | -0.000 | -0.102 | 0.078 | 0.020 | torch.Size([120, 240]) || stage4.residual_group2.blocks.0.mlp.fc2.weight + | 0.000 | -0.024 | 0.020 | 0.009 | torch.Size([120]) || stage4.residual_group2.blocks.0.mlp.fc2.bias + | 1.001 | 0.998 | 1.003 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.weight + | -0.000 | -0.002 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.071 | 0.079 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.078 | 0.096 | 0.020 | torch.Size([360, 120]) || stage4.residual_group2.blocks.1.attn.qkv_self.weight + | 0.000 | -0.005 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.077 | 0.080 | 0.020 | torch.Size([120, 120]) || stage4.residual_group2.blocks.1.attn.proj.weight + | 0.000 | -0.020 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.weight + | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.bias + | -0.000 | -0.085 | 0.082 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc11.weight + | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.083 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc12.weight + | 0.000 | -0.001 | 0.000 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.078 | 0.078 | 0.020 | torch.Size([120, 240]) || 
stage4.residual_group2.blocks.1.mlp.fc2.weight + | 0.000 | -0.022 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.1.mlp.fc2.bias + | 0.000 | -0.092 | 0.112 | 0.023 | torch.Size([120, 120]) || stage4.linear2.weight + | 0.000 | -0.032 | 0.049 | 0.015 | torch.Size([120]) || stage4.linear2.bias + | 0.000 | -0.036 | 0.037 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.weight + | 0.000 | -0.005 | 0.005 | 0.002 | torch.Size([120]) || stage4.pa_deform.bias + | -0.000 | -0.021 | 0.022 | 0.012 | torch.Size([120, 242, 3, 3]) || stage4.pa_deform.conv_offset.0.weight + | -0.001 | -0.021 | 0.021 | 0.012 | torch.Size([120]) || stage4.pa_deform.conv_offset.0.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.2.weight + | 0.002 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage4.pa_deform.conv_offset.2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.4.weight + | -0.002 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage4.pa_deform.conv_offset.4.bias + | 0.000 | -0.003 | 0.002 | 0.000 | torch.Size([324, 120, 3, 3]) || stage4.pa_deform.conv_offset.6.weight + | 0.000 | -0.005 | 0.004 | 0.001 | torch.Size([324]) || stage4.pa_deform.conv_offset.6.bias + | 0.000 | -0.172 | 0.177 | 0.022 | torch.Size([360, 360]) || stage4.pa_fuse.fc11.weight + | 0.002 | -0.027 | 0.088 | 0.014 | torch.Size([360]) || stage4.pa_fuse.fc11.bias + | 0.000 | -0.212 | 0.163 | 0.022 | torch.Size([360, 360]) || stage4.pa_fuse.fc12.weight + | 0.000 | -0.066 | 0.081 | 0.014 | torch.Size([360]) || stage4.pa_fuse.fc12.bias + | 0.000 | -0.413 | 0.387 | 0.029 | torch.Size([120, 360]) || stage4.pa_fuse.fc2.weight + | -0.001 | -0.198 | 0.214 | 0.073 | torch.Size([120]) || stage4.pa_fuse.fc2.bias + | 0.979 | 0.896 | 1.076 | 0.053 | torch.Size([30]) || stage5.reshape.1.weight + | -0.005 | -0.074 | 0.100 | 0.043 | torch.Size([30]) || stage5.reshape.1.bias + | 0.000 | -0.240 | 0.249 | 0.058 | torch.Size([120, 30]) || stage5.reshape.2.weight + | -0.002 | -0.286 | 0.229 | 0.080 | torch.Size([120]) || stage5.reshape.2.bias + | 1.001 | 0.993 | 1.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.weight + | -0.004 | -0.018 | 0.006 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.066 | 0.062 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.091 | 0.086 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_self.weight + | -0.000 | -0.014 | 0.012 | 0.004 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_self.bias + | -0.000 | -0.166 | 0.172 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.attn.proj.weight + | -0.001 | -0.053 | 0.045 | 0.018 | torch.Size([120]) || stage5.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.090 | 0.081 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.999 | 0.987 | 1.001 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.weight + | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([120]) || 
stage5.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.094 | 0.079 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc11.weight + | 0.000 | -0.022 | 0.012 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.082 | 0.083 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc12.weight + | 0.000 | -0.013 | 0.014 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.075 | 0.083 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.mlp.fc2.weight + | 0.000 | -0.073 | 0.078 | 0.021 | torch.Size([120]) || stage5.residual_group1.blocks.0.mlp.fc2.bias + | 1.001 | 0.994 | 1.007 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.weight + | -0.004 | -0.016 | 0.004 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.bias + | 0.000 | -0.065 | 0.063 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.1.attn.position_bias + | -0.000 | -0.077 | 0.083 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_self.weight + | 0.000 | -0.022 | 0.017 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.113 | 0.098 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.attn.proj.weight + | 0.000 | -0.058 | 0.045 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.080 | 0.080 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.000 | -0.008 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.999 | 0.982 | 1.001 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.weight + | 0.000 | -0.006 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.bias + | -0.000 | -0.076 | 0.083 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc11.weight + | 0.000 | -0.017 | 0.014 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.080 | 0.086 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc12.weight + | -0.000 | -0.014 | 0.016 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.096 | 0.079 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.mlp.fc2.weight + | 0.001 | -0.051 | 0.039 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.1.mlp.fc2.bias + | 1.002 | 0.998 | 1.009 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.weight + | -0.004 | -0.014 | 0.003 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.bias + | 0.000 | -0.067 | 0.073 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.085 | 0.087 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_self.weight + | 0.000 | -0.015 | 0.014 | 0.003 | torch.Size([360]) || 
stage5.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.108 | 0.095 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.attn.proj.weight + | -0.001 | -0.043 | 0.039 | 0.013 | torch.Size([120]) || stage5.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.088 | 0.081 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.000 | -0.009 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.999 | 0.978 | 1.001 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.weight + | 0.000 | -0.003 | 0.004 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.076 | 0.081 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc11.weight + | -0.000 | -0.012 | 0.019 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.079 | 0.077 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc12.weight + | -0.001 | -0.014 | 0.012 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.076 | 0.082 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.mlp.fc2.weight + | -0.000 | -0.047 | 0.043 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.2.mlp.fc2.bias + | 1.002 | 0.978 | 1.015 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.weight + | -0.004 | -0.013 | 0.004 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.bias + | -0.000 | -0.084 | 0.070 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.078 | 0.082 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_self.weight + | -0.000 | -0.014 | 0.014 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.123 | 0.132 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.attn.proj.weight + | 0.001 | -0.028 | 0.044 | 0.015 | torch.Size([120]) || stage5.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.082 | 0.089 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.999 | 0.974 | 1.001 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.weight + | 0.000 | -0.008 | 0.010 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.075 | 0.088 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc11.weight + | 0.000 | -0.014 | 0.019 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.081 | 0.080 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc12.weight + | 0.000 | -0.031 | 0.020 | 0.006 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.081 | 0.106 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.mlp.fc2.weight + | -0.002 | -0.046 | 0.042 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.3.mlp.fc2.bias + | 1.003 | 0.944 | 1.017 | 0.009 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.weight + | -0.005 | 
-0.015 | 0.004 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.071 | 0.067 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.085 | 0.090 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_self.weight + | -0.000 | -0.021 | 0.013 | 0.004 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.130 | 0.089 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.attn.proj.weight + | -0.001 | -0.036 | 0.024 | 0.011 | torch.Size([120]) || stage5.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.086 | 0.076 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.000 | -0.008 | 0.008 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.999 | 0.967 | 1.001 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.weight + | 0.000 | -0.006 | 0.007 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.bias + | 0.000 | -0.080 | 0.085 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc11.weight + | -0.001 | -0.015 | 0.010 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.081 | 0.077 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc12.weight + | -0.000 | -0.020 | 0.018 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.081 | 0.085 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.mlp.fc2.weight + | -0.001 | -0.037 | 0.050 | 0.014 | torch.Size([120]) || stage5.residual_group1.blocks.4.mlp.fc2.bias + | 1.004 | 0.976 | 1.039 | 0.008 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.weight + | -0.005 | -0.015 | 0.005 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.bias + | -0.000 | -0.070 | 0.076 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.099 | 0.097 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_self.weight + | -0.000 | -0.011 | 0.012 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.084 | 0.093 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.attn.proj.weight + | 0.000 | -0.038 | 0.035 | 0.012 | torch.Size([120]) || stage5.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.087 | 0.082 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.000 | -0.008 | 0.010 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.998 | 0.960 | 1.002 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.weight + | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.088 | 0.095 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc11.weight + | -0.000 | 
-0.014 | 0.027 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.081 | 0.074 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc12.weight + | 0.000 | -0.013 | 0.025 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -0.100 | 0.086 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.mlp.fc2.weight + | 0.000 | -0.022 | 0.030 | 0.011 | torch.Size([120]) || stage5.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.102 | 0.117 | 0.023 | torch.Size([120, 120]) || stage5.linear1.weight + | -0.003 | -0.297 | 0.242 | 0.084 | torch.Size([120]) || stage5.linear1.bias + | 0.999 | 0.971 | 1.008 | 0.005 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.weight + | -0.000 | -0.035 | 0.034 | 0.011 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.bias + | 0.000 | -0.079 | 0.074 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.087 | 0.083 | 0.020 | torch.Size([360, 120]) || stage5.residual_group2.blocks.0.attn.qkv_self.weight + | -0.000 | -0.028 | 0.018 | 0.005 | torch.Size([360]) || stage5.residual_group2.blocks.0.attn.qkv_self.bias + | 0.000 | -0.079 | 0.082 | 0.021 | torch.Size([120, 120]) || stage5.residual_group2.blocks.0.attn.proj.weight + | -0.001 | -0.146 | 0.171 | 0.054 | torch.Size([120]) || stage5.residual_group2.blocks.0.attn.proj.bias + | 0.997 | 0.967 | 1.003 | 0.006 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.weight + | 0.000 | -0.005 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.bias + | -0.000 | -0.073 | 0.089 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc11.weight + | -0.002 | -0.017 | 0.008 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.084 | 0.073 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc12.weight + | 0.000 | -0.013 | 0.011 | 0.003 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.083 | 0.085 | 0.020 | torch.Size([120, 240]) || stage5.residual_group2.blocks.0.mlp.fc2.weight + | 0.000 | -0.103 | 0.140 | 0.037 | torch.Size([120]) || stage5.residual_group2.blocks.0.mlp.fc2.bias + | 0.999 | 0.986 | 1.010 | 0.004 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.weight + | 0.000 | -0.035 | 0.034 | 0.010 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.bias + | 0.000 | -0.087 | 0.074 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.084 | 0.079 | 0.020 | torch.Size([360, 120]) || stage5.residual_group2.blocks.1.attn.qkv_self.weight + | 0.000 | -0.024 | 0.024 | 0.005 | torch.Size([360]) || stage5.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.077 | 0.078 | 0.021 | torch.Size([120, 120]) || stage5.residual_group2.blocks.1.attn.proj.weight + | -0.001 | -0.112 | 0.144 | 0.038 | torch.Size([120]) || stage5.residual_group2.blocks.1.attn.proj.bias + | 0.998 | 0.965 | 1.004 | 0.006 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.weight + | 0.000 | -0.004 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.bias + 
| 0.000 | -0.088 | 0.079 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc11.weight + | -0.001 | -0.012 | 0.015 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.102 | 0.080 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc12.weight + | 0.000 | -0.012 | 0.009 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc12.bias + | 0.000 | -0.075 | 0.078 | 0.020 | torch.Size([120, 240]) || stage5.residual_group2.blocks.1.mlp.fc2.weight + | 0.000 | -0.105 | 0.131 | 0.042 | torch.Size([120]) || stage5.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.220 | 0.209 | 0.035 | torch.Size([120, 120]) || stage5.linear2.weight + | -0.003 | -0.335 | 0.284 | 0.096 | torch.Size([120]) || stage5.linear2.bias + | -0.000 | -0.064 | 0.065 | 0.019 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.weight + | 0.001 | -0.050 | 0.050 | 0.029 | torch.Size([120]) || stage5.pa_deform.bias + | 0.000 | -0.119 | 0.106 | 0.013 | torch.Size([120, 242, 3, 3]) || stage5.pa_deform.conv_offset.0.weight + | -0.006 | -0.030 | 0.026 | 0.014 | torch.Size([120]) || stage5.pa_deform.conv_offset.0.bias + | -0.001 | -0.055 | 0.050 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.2.weight + | 0.001 | -0.033 | 0.031 | 0.018 | torch.Size([120]) || stage5.pa_deform.conv_offset.2.bias + | 0.001 | -0.060 | 0.050 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.4.weight + | -0.005 | -0.040 | 0.037 | 0.019 | torch.Size([120]) || stage5.pa_deform.conv_offset.4.bias + | 0.001 | -0.038 | 0.051 | 0.006 | torch.Size([324, 120, 3, 3]) || stage5.pa_deform.conv_offset.6.weight + | 0.000 | -0.048 | 0.050 | 0.017 | torch.Size([324]) || stage5.pa_deform.conv_offset.6.bias + | 0.000 | -0.334 | 0.340 | 0.036 | torch.Size([360, 360]) || stage5.pa_fuse.fc11.weight + | 0.037 | -0.050 | 0.294 | 0.064 | torch.Size([360]) || stage5.pa_fuse.fc11.bias + | -0.000 | -0.343 | 0.349 | 0.036 | torch.Size([360, 360]) || stage5.pa_fuse.fc12.weight + | -0.001 | -0.237 | 0.244 | 0.049 | torch.Size([360]) || stage5.pa_fuse.fc12.bias + | -0.000 | -0.575 | 0.591 | 0.060 | torch.Size([120, 360]) || stage5.pa_fuse.fc2.weight + | -0.001 | -0.404 | 0.344 | 0.122 | torch.Size([120]) || stage5.pa_fuse.fc2.bias + | 1.254 | 1.058 | 1.466 | 0.126 | torch.Size([30]) || stage6.reshape.1.weight + | -0.001 | -0.074 | 0.093 | 0.041 | torch.Size([30]) || stage6.reshape.1.bias + | 0.000 | -0.734 | 0.625 | 0.177 | torch.Size([120, 30]) || stage6.reshape.2.weight + | 0.003 | -0.269 | 0.341 | 0.108 | torch.Size([120]) || stage6.reshape.2.bias + | 0.815 | 0.495 | 1.118 | 0.121 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.weight + | -0.071 | -0.291 | 0.263 | 0.101 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.080 | 0.087 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.136 | 0.134 | 0.026 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_self.weight + | -0.000 | -0.061 | 0.037 | 0.014 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_self.bias + | -0.000 | -0.201 | 0.182 | 0.032 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.attn.proj.weight + 
| 0.000 | -0.223 | 0.189 | 0.090 | torch.Size([120]) || stage6.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.184 | 0.211 | 0.029 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.000 | -0.049 | 0.069 | 0.011 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.710 | 0.556 | 0.893 | 0.072 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.weight + | -0.003 | -0.172 | 0.193 | 0.070 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.217 | 0.211 | 0.033 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc11.weight + | -0.041 | -0.158 | 0.025 | 0.036 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.209 | 0.178 | 0.031 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc12.weight + | -0.000 | -0.141 | 0.186 | 0.031 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc12.bias + | 0.000 | -0.245 | 0.347 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.mlp.fc2.weight + | 0.005 | -0.161 | 0.188 | 0.079 | torch.Size([120]) || stage6.residual_group1.blocks.0.mlp.fc2.bias + | 0.780 | 0.582 | 0.963 | 0.088 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.weight + | -0.112 | -0.302 | 0.103 | 0.085 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.bias + | 0.000 | -0.101 | 0.072 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.112 | 0.178 | 0.026 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_self.weight + | -0.000 | -0.034 | 0.049 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.223 | 0.242 | 0.033 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.attn.proj.weight + | -0.003 | -0.149 | 0.105 | 0.047 | torch.Size([120]) || stage6.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.199 | 0.173 | 0.031 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.000 | -0.035 | 0.056 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.744 | 0.530 | 0.917 | 0.066 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.weight + | 0.004 | -0.131 | 0.180 | 0.059 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.243 | 0.294 | 0.036 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc11.weight + | -0.039 | -0.217 | 0.045 | 0.037 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.206 | 0.178 | 0.033 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc12.weight + | -0.000 | -0.129 | 0.125 | 0.028 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.236 | 0.276 | 0.040 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.mlp.fc2.weight + | 0.000 | -0.158 | 0.170 | 0.063 | torch.Size([120]) || stage6.residual_group1.blocks.1.mlp.fc2.bias + | 0.829 | 0.586 | 1.007 | 0.078 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.weight + | -0.101 | -0.353 | 0.132 | 0.092 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.bias + | -0.000 | -0.082 | 0.076 | 0.021 | torch.Size([675, 6]) || 
stage6.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.154 | 0.143 | 0.032 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_self.weight + | 0.000 | -0.041 | 0.038 | 0.012 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.187 | 0.202 | 0.035 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.096 | 0.127 | 0.041 | torch.Size([120]) || stage6.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.203 | 0.185 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.000 | -0.045 | 0.049 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.768 | 0.491 | 0.904 | 0.069 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.weight + | 0.001 | -0.146 | 0.159 | 0.062 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.184 | 0.204 | 0.037 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc11.weight + | -0.043 | -0.185 | 0.020 | 0.035 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.188 | 0.270 | 0.035 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc12.weight + | 0.000 | -0.152 | 0.134 | 0.031 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc12.bias + | -0.000 | -0.222 | 0.217 | 0.042 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.mlp.fc2.weight + | 0.002 | -0.141 | 0.144 | 0.058 | torch.Size([120]) || stage6.residual_group1.blocks.2.mlp.fc2.bias + | 0.820 | 0.554 | 0.976 | 0.065 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.weight + | -0.091 | -0.336 | 0.137 | 0.087 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.bias + | 0.000 | -0.124 | 0.222 | 0.023 | torch.Size([675, 6]) || stage6.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.157 | 0.175 | 0.036 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_self.weight + | -0.001 | -0.049 | 0.049 | 0.014 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.238 | 0.236 | 0.036 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.attn.proj.weight + | -0.003 | -0.077 | 0.074 | 0.031 | torch.Size([120]) || stage6.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.212 | 0.265 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.000 | -0.028 | 0.052 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.768 | 0.530 | 0.903 | 0.080 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.weight + | 0.002 | -0.104 | 0.157 | 0.044 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.bias + | -0.000 | -0.197 | 0.220 | 0.039 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc11.weight + | -0.042 | -0.155 | 0.043 | 0.039 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.166 | 0.199 | 0.036 | torch.Size([240, 120]) || 
stage6.residual_group1.blocks.3.mlp.fc12.weight + | 0.001 | -0.102 | 0.138 | 0.040 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.241 | 0.256 | 0.044 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.mlp.fc2.weight + | 0.003 | -0.123 | 0.115 | 0.046 | torch.Size([120]) || stage6.residual_group1.blocks.3.mlp.fc2.bias + | 0.817 | 0.631 | 0.918 | 0.055 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.weight + | -0.082 | -0.295 | 0.141 | 0.074 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.084 | 0.205 | 0.024 | torch.Size([675, 6]) || stage6.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.174 | 0.199 | 0.040 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_self.weight + | -0.000 | -0.060 | 0.081 | 0.017 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_self.bias + | -0.000 | -0.194 | 0.191 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.attn.proj.weight + | 0.001 | -0.083 | 0.077 | 0.035 | torch.Size([120]) || stage6.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.218 | 0.243 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.000 | -0.031 | 0.024 | 0.007 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.744 | 0.478 | 0.913 | 0.082 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.weight + | -0.003 | -0.146 | 0.110 | 0.053 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.223 | 0.238 | 0.042 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc11.weight + | -0.046 | -0.200 | 0.071 | 0.051 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.168 | 0.201 | 0.039 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc12.weight + | 0.002 | -0.128 | 0.141 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -0.220 | 0.205 | 0.047 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.mlp.fc2.weight + | 0.001 | -0.086 | 0.094 | 0.034 | torch.Size([120]) || stage6.residual_group1.blocks.4.mlp.fc2.bias + | 0.754 | 0.353 | 0.933 | 0.056 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.weight + | -0.058 | -0.246 | 0.105 | 0.060 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.bias + | -0.000 | -0.113 | 0.536 | 0.030 | torch.Size([675, 6]) || stage6.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.261 | 0.224 | 0.044 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_self.weight + | 0.002 | -0.050 | 0.067 | 0.018 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_self.bias + | 0.000 | -0.234 | 0.256 | 0.038 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.attn.proj.weight + | 0.002 | -0.079 | 0.076 | 0.036 | torch.Size([120]) || stage6.residual_group1.blocks.5.attn.proj.bias + | -0.000 | -0.211 | 0.231 | 0.029 | torch.Size([360, 120]) || 
stage6.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.000 | -0.033 | 0.030 | 0.008 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.677 | 0.275 | 0.833 | 0.083 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.weight + | 0.001 | -0.224 | 0.306 | 0.102 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.196 | 0.211 | 0.045 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc11.weight + | -0.061 | -0.289 | 0.136 | 0.089 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.271 | 0.312 | 0.048 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc12.weight + | 0.003 | -0.166 | 0.155 | 0.075 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.286 | 0.375 | 0.054 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.mlp.fc2.weight + | 0.005 | -0.054 | 0.137 | 0.031 | torch.Size([120]) || stage6.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.174 | 0.172 | 0.039 | torch.Size([120, 120]) || stage6.linear1.weight + | 0.002 | -0.275 | 0.348 | 0.113 | torch.Size([120]) || stage6.linear1.bias + | 0.704 | 0.402 | 1.002 | 0.132 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.weight + | 0.001 | -0.466 | 0.407 | 0.157 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.bias + | -0.000 | -0.172 | 0.570 | 0.025 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.337 | 0.378 | 0.041 | torch.Size([360, 120]) || stage6.residual_group2.blocks.0.attn.qkv_self.weight + | -0.000 | -0.071 | 0.068 | 0.019 | torch.Size([360]) || stage6.residual_group2.blocks.0.attn.qkv_self.bias + | 0.001 | -0.290 | 0.321 | 0.055 | torch.Size([120, 120]) || stage6.residual_group2.blocks.0.attn.proj.weight + | 0.001 | -0.255 | 0.250 | 0.104 | torch.Size([120]) || stage6.residual_group2.blocks.0.attn.proj.bias + | 0.695 | 0.353 | 0.966 | 0.098 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.weight + | -0.001 | -0.218 | 0.165 | 0.080 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.259 | 0.255 | 0.039 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc11.weight + | -0.044 | -0.256 | 0.042 | 0.047 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.234 | 0.214 | 0.035 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc12.weight + | 0.002 | -0.133 | 0.091 | 0.027 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.333 | 0.296 | 0.042 | torch.Size([120, 240]) || stage6.residual_group2.blocks.0.mlp.fc2.weight + | 0.003 | -0.238 | 0.280 | 0.092 | torch.Size([120]) || stage6.residual_group2.blocks.0.mlp.fc2.bias + | 0.671 | 0.425 | 0.980 | 0.094 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.weight + | 0.001 | -0.261 | 0.305 | 0.119 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.372 | 0.942 | 0.031 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.450 | 0.494 | 0.045 | torch.Size([360, 120]) || stage6.residual_group2.blocks.1.attn.qkv_self.weight + | 0.000 | -0.133 | 0.119 | 0.029 | 
torch.Size([360]) || stage6.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.239 | 0.288 | 0.046 | torch.Size([120, 120]) || stage6.residual_group2.blocks.1.attn.proj.weight + | -0.001 | -0.187 | 0.157 | 0.064 | torch.Size([120]) || stage6.residual_group2.blocks.1.attn.proj.bias + | 0.687 | 0.160 | 0.907 | 0.128 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.weight + | -0.002 | -0.192 | 0.222 | 0.084 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.257 | 0.426 | 0.042 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc11.weight + | -0.064 | -0.207 | 0.036 | 0.048 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.269 | 0.224 | 0.038 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc12.weight + | -0.000 | -0.126 | 0.129 | 0.030 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.308 | 0.298 | 0.041 | torch.Size([120, 240]) || stage6.residual_group2.blocks.1.mlp.fc2.weight + | -0.004 | -0.180 | 0.192 | 0.061 | torch.Size([120]) || stage6.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.297 | 0.368 | 0.069 | torch.Size([120, 120]) || stage6.linear2.weight + | 0.001 | -0.431 | 0.480 | 0.189 | torch.Size([120]) || stage6.linear2.bias + | 0.000 | -0.100 | 0.104 | 0.023 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.weight + | 0.001 | -0.018 | 0.029 | 0.010 | torch.Size([120]) || stage6.pa_deform.bias + | 0.000 | -0.105 | 0.111 | 0.015 | torch.Size([120, 242, 3, 3]) || stage6.pa_deform.conv_offset.0.weight + | -0.007 | -0.033 | 0.024 | 0.014 | torch.Size([120]) || stage6.pa_deform.conv_offset.0.bias + | -0.001 | -0.071 | 0.067 | 0.019 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.2.weight + | -0.003 | -0.061 | 0.043 | 0.022 | torch.Size([120]) || stage6.pa_deform.conv_offset.2.bias + | -0.000 | -0.074 | 0.068 | 0.019 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.4.weight + | 0.001 | -0.075 | 0.056 | 0.030 | torch.Size([120]) || stage6.pa_deform.conv_offset.4.bias + | 0.001 | -0.124 | 0.108 | 0.013 | torch.Size([324, 120, 3, 3]) || stage6.pa_deform.conv_offset.6.weight + | -0.001 | -0.113 | 0.076 | 0.021 | torch.Size([324]) || stage6.pa_deform.conv_offset.6.bias + | -0.001 | -0.517 | 0.524 | 0.101 | torch.Size([360, 360]) || stage6.pa_fuse.fc11.weight + | 0.154 | -0.305 | 0.679 | 0.180 | torch.Size([360]) || stage6.pa_fuse.fc11.bias + | 0.000 | -0.680 | 0.728 | 0.103 | torch.Size([360, 360]) || stage6.pa_fuse.fc12.weight + | 0.020 | -0.514 | 0.417 | 0.199 | torch.Size([360]) || stage6.pa_fuse.fc12.bias + | -0.000 | -0.587 | 0.737 | 0.135 | torch.Size([120, 360]) || stage6.pa_fuse.fc2.weight + | 0.015 | -0.437 | 0.490 | 0.230 | torch.Size([120]) || stage6.pa_fuse.fc2.bias + | 1.284 | 1.119 | 1.404 | 0.055 | torch.Size([30]) || stage7.reshape.1.weight + | -0.014 | -0.286 | 0.184 | 0.122 | torch.Size([30]) || stage7.reshape.1.bias + | -0.000 | -0.521 | 0.576 | 0.154 | torch.Size([120, 30]) || stage7.reshape.2.weight + | 0.004 | -0.387 | 0.738 | 0.175 | torch.Size([120]) || stage7.reshape.2.bias + | 0.440 | 0.099 | 0.775 | 0.141 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.weight + | -0.177 | -0.670 | 0.319 | 0.183 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.bias + | -0.055 | -2.159 | 1.979 | 0.240 | torch.Size([675, 6]) || stage7.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || 
stage7.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.535 | 0.554 | 0.104 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_self.weight + | 0.003 | -0.193 | 0.281 | 0.053 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_self.bias + | -0.001 | -0.397 | 0.395 | 0.075 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.attn.proj.weight + | -0.001 | -0.232 | 0.692 | 0.106 | torch.Size([120]) || stage7.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.899 | 1.073 | 0.091 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.122 | 0.104 | 0.017 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.310 | 0.157 | 0.440 | 0.055 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.weight + | 0.006 | -0.474 | 0.266 | 0.105 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.605 | 0.490 | 0.115 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc11.weight + | -0.101 | -0.310 | 0.126 | 0.070 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.448 | 0.475 | 0.116 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc12.weight + | 0.006 | -0.185 | 0.215 | 0.071 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc12.bias + | 0.001 | -0.465 | 0.512 | 0.122 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.mlp.fc2.weight + | 0.000 | -0.150 | 0.417 | 0.077 | torch.Size([120]) || stage7.residual_group1.blocks.0.mlp.fc2.bias + | 0.577 | 0.165 | 0.829 | 0.105 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.weight + | -0.136 | -0.849 | 0.206 | 0.141 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.bias + | -0.143 | -3.020 | 4.621 | 0.357 | torch.Size([675, 6]) || stage7.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.647 | 0.640 | 0.123 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_self.weight + | -0.002 | -0.356 | 0.382 | 0.064 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.457 | 0.378 | 0.081 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.attn.proj.weight + | 0.000 | -0.250 | 0.707 | 0.108 | torch.Size([120]) || stage7.residual_group1.blocks.1.attn.proj.bias + | -0.001 | -1.055 | 1.091 | 0.096 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.001 | -0.093 | 0.123 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.411 | 0.265 | 0.535 | 0.044 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.weight + | 0.008 | -0.630 | 0.264 | 0.121 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.501 | 0.506 | 0.119 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc11.weight + | -0.087 | -0.341 | 0.140 | 0.073 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.450 | 0.527 | 0.119 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc12.weight + | 0.005 | -0.188 | 0.171 | 0.063 | torch.Size([240]) || 
stage7.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.554 | 0.546 | 0.121 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.mlp.fc2.weight + | -0.000 | -0.135 | 0.220 | 0.061 | torch.Size([120]) || stage7.residual_group1.blocks.1.mlp.fc2.bias + | 0.655 | 0.134 | 0.896 | 0.130 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.weight + | -0.139 | -0.788 | 0.181 | 0.115 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.bias + | -0.062 | -3.469 | 3.276 | 0.272 | torch.Size([675, 6]) || stage7.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.2.attn.position_bias + | -0.000 | -0.592 | 0.650 | 0.124 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_self.weight + | -0.000 | -0.308 | 0.218 | 0.062 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.355 | 0.345 | 0.082 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.213 | 0.700 | 0.097 | torch.Size([120]) || stage7.residual_group1.blocks.2.attn.proj.bias + | -0.001 | -1.166 | 0.942 | 0.107 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.000 | -0.106 | 0.093 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.466 | 0.317 | 0.565 | 0.042 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.weight + | 0.014 | -0.657 | 0.280 | 0.118 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.541 | 0.494 | 0.118 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc11.weight + | -0.079 | -0.335 | 0.122 | 0.080 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.513 | 0.493 | 0.123 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc12.weight + | -0.007 | -0.180 | 0.175 | 0.066 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc12.bias + | -0.001 | -0.509 | 0.479 | 0.123 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.mlp.fc2.weight + | 0.004 | -0.093 | 0.293 | 0.054 | torch.Size([120]) || stage7.residual_group1.blocks.2.mlp.fc2.bias + | 0.693 | 0.147 | 0.945 | 0.133 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.weight + | -0.132 | -0.906 | 0.249 | 0.113 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.bias + | -0.108 | -3.576 | 4.241 | 0.344 | torch.Size([675, 6]) || stage7.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.945 | 1.095 | 0.129 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_self.weight + | 0.003 | -0.274 | 0.204 | 0.061 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_self.bias + | -0.001 | -0.379 | 0.351 | 0.081 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.attn.proj.weight + | 0.000 | -0.211 | 0.587 | 0.095 | torch.Size([120]) || stage7.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -1.269 | 1.067 | 0.102 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.001 | -0.091 | 0.117 | 0.021 | torch.Size([360]) || 
stage7.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.499 | 0.285 | 0.570 | 0.040 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.weight + | 0.012 | -0.567 | 0.273 | 0.104 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.bias + | 0.001 | -0.528 | 0.499 | 0.118 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc11.weight + | -0.084 | -0.349 | 0.141 | 0.078 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.547 | 0.592 | 0.126 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc12.weight + | 0.002 | -0.154 | 0.176 | 0.068 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc12.bias + | 0.001 | -0.520 | 0.480 | 0.125 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.mlp.fc2.weight + | 0.001 | -0.150 | 0.207 | 0.065 | torch.Size([120]) || stage7.residual_group1.blocks.3.mlp.fc2.bias + | 0.726 | 0.137 | 1.004 | 0.160 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.weight + | -0.122 | -0.907 | 0.180 | 0.103 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.bias + | -0.078 | -3.824 | 4.241 | 0.297 | torch.Size([675, 6]) || stage7.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.4.attn.position_bias + | -0.000 | -1.188 | 0.796 | 0.127 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_self.weight + | 0.002 | -0.248 | 0.207 | 0.056 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_self.bias + | -0.001 | -0.409 | 0.369 | 0.085 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.attn.proj.weight + | 0.002 | -0.224 | 0.322 | 0.094 | torch.Size([120]) || stage7.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -1.744 | 1.273 | 0.110 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.001 | -0.092 | 0.113 | 0.019 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.514 | 0.277 | 0.614 | 0.041 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.weight + | 0.016 | -0.621 | 0.286 | 0.095 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.bias + | 0.001 | -0.517 | 0.453 | 0.116 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc11.weight + | -0.064 | -0.260 | 0.143 | 0.083 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.503 | 0.554 | 0.129 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc12.weight + | -0.004 | -0.232 | 0.193 | 0.075 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc12.bias + | -0.001 | -0.595 | 0.543 | 0.128 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.mlp.fc2.weight + | 0.001 | -0.196 | 0.198 | 0.071 | torch.Size([120]) || stage7.residual_group1.blocks.4.mlp.fc2.bias + | 0.731 | 0.152 | 1.075 | 0.114 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.weight + | -0.076 | -1.003 | 0.176 | 0.107 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.bias + | -0.121 | -3.281 | 4.671 | 0.296 | torch.Size([675, 6]) || stage7.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || 
stage7.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.640 | 1.083 | 0.122 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_self.weight + | -0.001 | -0.239 | 0.314 | 0.068 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_self.bias + | 0.001 | -0.344 | 0.452 | 0.078 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.attn.proj.weight + | 0.004 | -0.361 | 0.251 | 0.093 | torch.Size([120]) || stage7.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.637 | 0.806 | 0.093 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.000 | -0.088 | 0.091 | 0.017 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.514 | 0.238 | 0.594 | 0.042 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.weight + | 0.017 | -0.650 | 0.162 | 0.089 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.bias + | 0.000 | -0.442 | 0.479 | 0.114 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc11.weight + | -0.040 | -0.400 | 0.203 | 0.101 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.541 | 0.514 | 0.130 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc12.weight + | -0.008 | -0.319 | 0.309 | 0.092 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -1.018 | 1.398 | 0.130 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.mlp.fc2.weight + | 0.001 | -1.606 | 0.269 | 0.179 | torch.Size([120]) || stage7.residual_group1.blocks.5.mlp.fc2.bias + | 0.000 | -0.186 | 0.207 | 0.048 | torch.Size([120, 120]) || stage7.linear1.weight + | 0.010 | -0.448 | 0.437 | 0.161 | torch.Size([120]) || stage7.linear1.bias + | 0.703 | 0.381 | 0.856 | 0.084 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.weight + | 0.014 | -0.645 | 0.486 | 0.169 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.bias + | -0.007 | -4.468 | 1.008 | 0.164 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage7.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.625 | 0.834 | 0.120 | torch.Size([360, 120]) || stage7.residual_group2.blocks.0.attn.qkv_self.weight + | -0.009 | -0.737 | 0.632 | 0.135 | torch.Size([360]) || stage7.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.403 | 0.406 | 0.088 | torch.Size([120, 120]) || stage7.residual_group2.blocks.0.attn.proj.weight + | -0.007 | -0.338 | 0.165 | 0.070 | torch.Size([120]) || stage7.residual_group2.blocks.0.attn.proj.bias + | 0.435 | 0.323 | 0.526 | 0.038 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.weight + | 0.005 | -0.678 | 0.379 | 0.117 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.465 | 0.467 | 0.110 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc11.weight + | -0.031 | -0.236 | 0.180 | 0.077 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.490 | 0.520 | 0.121 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc12.weight + | -0.003 | -0.197 | 0.242 | 0.069 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc12.bias + | -0.000 | -0.525 | 0.501 | 0.122 | torch.Size([120, 240]) || stage7.residual_group2.blocks.0.mlp.fc2.weight + | -0.005 | -0.431 | 0.164 | 0.077 | torch.Size([120]) || stage7.residual_group2.blocks.0.mlp.fc2.bias + | 0.703 | 0.306 | 0.866 | 0.079 | torch.Size([120]) || 
stage7.residual_group2.blocks.1.norm1.weight + | 0.009 | -0.647 | 0.481 | 0.149 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.bias + | -0.010 | -3.504 | 1.842 | 0.134 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage7.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.639 | 0.590 | 0.122 | torch.Size([360, 120]) || stage7.residual_group2.blocks.1.attn.qkv_self.weight + | -0.001 | -0.613 | 0.609 | 0.148 | torch.Size([360]) || stage7.residual_group2.blocks.1.attn.qkv_self.bias + | 0.001 | -0.316 | 0.325 | 0.085 | torch.Size([120, 120]) || stage7.residual_group2.blocks.1.attn.proj.weight + | -0.004 | -0.350 | 0.145 | 0.069 | torch.Size([120]) || stage7.residual_group2.blocks.1.attn.proj.bias + | 0.452 | 0.309 | 0.558 | 0.037 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.weight + | 0.003 | -0.661 | 0.246 | 0.091 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.580 | 0.410 | 0.108 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc11.weight + | -0.020 | -0.258 | 0.299 | 0.104 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.561 | 0.126 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc12.weight + | -0.002 | -0.234 | 0.434 | 0.090 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.778 | 0.581 | 0.124 | torch.Size([120, 240]) || stage7.residual_group2.blocks.1.mlp.fc2.weight + | -0.001 | -0.888 | 0.286 | 0.135 | torch.Size([120]) || stage7.residual_group2.blocks.1.mlp.fc2.bias + | -0.001 | -0.348 | 0.237 | 0.060 | torch.Size([120, 120]) || stage7.linear2.weight + | 0.023 | -0.390 | 0.506 | 0.167 | torch.Size([120]) || stage7.linear2.bias + | -0.000 | -0.104 | 0.107 | 0.024 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.weight + | 0.002 | -0.041 | 0.035 | 0.016 | torch.Size([120]) || stage7.pa_deform.bias + | -0.000 | -0.123 | 0.109 | 0.017 | torch.Size([120, 242, 3, 3]) || stage7.pa_deform.conv_offset.0.weight + | -0.002 | -0.034 | 0.032 | 0.015 | torch.Size([120]) || stage7.pa_deform.conv_offset.0.bias + | -0.001 | -0.111 | 0.084 | 0.019 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.2.weight + | -0.008 | -0.073 | 0.081 | 0.034 | torch.Size([120]) || stage7.pa_deform.conv_offset.2.bias + | -0.002 | -0.154 | 0.122 | 0.018 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.4.weight + | 0.014 | -0.041 | 0.068 | 0.026 | torch.Size([120]) || stage7.pa_deform.conv_offset.4.bias + | -0.001 | -0.408 | 0.365 | 0.034 | torch.Size([324, 120, 3, 3]) || stage7.pa_deform.conv_offset.6.weight + | -0.003 | -0.057 | 0.054 | 0.024 | torch.Size([324]) || stage7.pa_deform.conv_offset.6.bias + | 0.000 | -0.697 | 0.606 | 0.123 | torch.Size([360, 360]) || stage7.pa_fuse.fc11.weight + | 0.119 | -0.211 | 0.720 | 0.177 | torch.Size([360]) || stage7.pa_fuse.fc11.bias + | 0.000 | -1.175 | 0.924 | 0.154 | torch.Size([360, 360]) || stage7.pa_fuse.fc12.weight + | -0.000 | -0.581 | 0.580 | 0.190 | torch.Size([360]) || stage7.pa_fuse.fc12.bias + | 0.001 | -0.786 | 0.874 | 0.135 | torch.Size([120, 360]) || stage7.pa_fuse.fc2.weight + | -0.053 | -0.522 | 0.577 | 0.205 | torch.Size([120]) || stage7.pa_fuse.fc2.bias + | 1.225 | 1.000 | 1.516 | 0.095 | torch.Size([120]) || stage8.0.1.weight + | -0.013 | -0.413 | 0.465 | 0.139 | torch.Size([120]) || stage8.0.1.bias + | 0.000 | -2.505 | 0.627 | 0.136 
| torch.Size([180, 120]) || stage8.0.2.weight + | 0.005 | -0.397 | 0.377 | 0.107 | torch.Size([180]) || stage8.0.2.bias + | 0.456 | 0.123 | 0.760 | 0.129 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.weight + | -0.022 | -0.343 | 0.875 | 0.099 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.bias + | -0.014 | -1.907 | 2.592 | 0.130 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.632 | 0.628 | 0.099 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.0.attn.qkv_self.weight + | 0.006 | -0.567 | 0.668 | 0.148 | torch.Size([540]) || stage8.1.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.477 | 0.447 | 0.094 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.0.attn.proj.weight + | -0.010 | -0.460 | 0.225 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.0.attn.proj.bias + | 0.429 | 0.119 | 0.634 | 0.090 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.weight + | -0.007 | -0.338 | 0.803 | 0.086 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.bias + | -0.006 | -0.572 | 0.539 | 0.119 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc11.weight + | -0.060 | -0.260 | 0.185 | 0.060 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.461 | 0.548 | 0.113 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc12.weight + | 0.000 | -0.163 | 0.183 | 0.050 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.757 | 0.581 | 0.118 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.0.mlp.fc2.weight + | -0.003 | -0.191 | 0.121 | 0.057 | torch.Size([180]) || stage8.1.residual_group.blocks.0.mlp.fc2.bias + | 0.557 | 0.086 | 0.800 | 0.112 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.weight + | -0.029 | -0.230 | 0.878 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.bias + | -0.016 | -2.004 | 1.711 | 0.154 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.690 | 0.575 | 0.109 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.1.attn.qkv_self.weight + | 0.011 | -0.641 | 0.609 | 0.135 | torch.Size([540]) || stage8.1.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.466 | 0.401 | 0.094 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.1.attn.proj.weight + | -0.008 | -0.344 | 0.181 | 0.080 | torch.Size([180]) || stage8.1.residual_group.blocks.1.attn.proj.bias + | 0.503 | 0.226 | 0.742 | 0.093 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.weight + | -0.009 | -0.404 | 0.818 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.bias + | -0.007 | -0.595 | 0.532 | 0.121 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc11.weight + | -0.068 | -0.261 | 0.071 | 0.053 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.573 | 0.116 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc12.weight + | 0.002 | -0.129 | 0.197 | 0.046 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.556 | 0.582 | 0.118 | torch.Size([180, 360]) || 
stage8.1.residual_group.blocks.1.mlp.fc2.weight + | -0.003 | -0.170 | 0.145 | 0.052 | torch.Size([180]) || stage8.1.residual_group.blocks.1.mlp.fc2.bias + | 0.699 | 0.202 | 0.912 | 0.109 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.weight + | -0.033 | -0.253 | 0.924 | 0.091 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.bias + | -0.030 | -2.510 | 2.088 | 0.194 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.637 | 0.801 | 0.116 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.2.attn.qkv_self.weight + | 0.006 | -0.512 | 0.520 | 0.110 | torch.Size([540]) || stage8.1.residual_group.blocks.2.attn.qkv_self.bias + | 0.000 | -0.381 | 0.337 | 0.090 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.2.attn.proj.weight + | -0.011 | -0.238 | 0.234 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.2.attn.proj.bias + | 0.594 | 0.150 | 0.810 | 0.108 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.weight + | -0.010 | -0.483 | 0.726 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.bias + | -0.006 | -0.567 | 0.499 | 0.125 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc11.weight + | -0.077 | -0.360 | 0.050 | 0.056 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.536 | 0.673 | 0.119 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc12.weight + | 0.001 | -0.142 | 0.186 | 0.043 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.536 | 0.524 | 0.119 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.2.mlp.fc2.weight + | -0.006 | -0.147 | 0.133 | 0.051 | torch.Size([180]) || stage8.1.residual_group.blocks.2.mlp.fc2.bias + | 0.683 | 0.141 | 0.908 | 0.105 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.weight + | -0.033 | -0.199 | 0.878 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.bias + | -0.039 | -1.527 | 3.891 | 0.199 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.682 | 0.693 | 0.120 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.3.attn.qkv_self.weight + | 0.007 | -0.543 | 0.513 | 0.138 | torch.Size([540]) || stage8.1.residual_group.blocks.3.attn.qkv_self.bias + | -0.001 | -0.390 | 0.476 | 0.089 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.3.attn.proj.weight + | -0.007 | -0.176 | 0.150 | 0.062 | torch.Size([180]) || stage8.1.residual_group.blocks.3.attn.proj.bias + | 0.640 | 0.094 | 0.853 | 0.120 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.weight + | -0.009 | -0.372 | 0.683 | 0.084 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.bias + | -0.006 | -0.628 | 0.521 | 0.126 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc11.weight + | -0.089 | -0.367 | 0.047 | 0.054 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.629 | 0.562 | 0.121 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc12.weight + | -0.001 | -0.186 | 0.128 | 0.042 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.485 | 0.499 | 0.118 | torch.Size([180, 360]) || 
stage8.1.residual_group.blocks.3.mlp.fc2.weight + | -0.007 | -0.138 | 0.209 | 0.050 | torch.Size([180]) || stage8.1.residual_group.blocks.3.mlp.fc2.bias + | 0.000 | -0.294 | 0.577 | 0.071 | torch.Size([180, 180]) || stage8.1.linear.weight + | 0.004 | -0.349 | 0.235 | 0.072 | torch.Size([180]) || stage8.1.linear.bias + | 0.708 | 0.242 | 1.026 | 0.136 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.weight + | -0.032 | -0.212 | 0.830 | 0.100 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.bias + | -0.039 | -1.954 | 2.394 | 0.212 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.922 | 0.646 | 0.116 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.429 | 0.524 | 0.101 | torch.Size([540]) || stage8.2.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.467 | 0.453 | 0.109 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.0.attn.proj.weight + | -0.005 | -0.339 | 0.264 | 0.095 | torch.Size([180]) || stage8.2.residual_group.blocks.0.attn.proj.bias + | 0.587 | 0.255 | 0.837 | 0.086 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.weight + | -0.011 | -0.285 | 0.721 | 0.083 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.bias + | -0.006 | -0.586 | 0.534 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc11.weight + | -0.075 | -0.225 | 0.066 | 0.047 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.493 | 0.532 | 0.123 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc12.weight + | 0.003 | -0.189 | 0.178 | 0.047 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.551 | 0.543 | 0.124 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.0.mlp.fc2.weight + | -0.010 | -0.154 | 0.142 | 0.054 | torch.Size([180]) || stage8.2.residual_group.blocks.0.mlp.fc2.bias + | 0.773 | 0.210 | 1.004 | 0.113 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.weight + | -0.035 | -0.176 | 0.873 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.bias + | -0.027 | -2.407 | 1.736 | 0.214 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.817 | 0.977 | 0.123 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.1.attn.qkv_self.weight + | 0.001 | -0.659 | 0.461 | 0.115 | torch.Size([540]) || stage8.2.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.484 | 0.453 | 0.109 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.1.attn.proj.weight + | -0.014 | -0.315 | 0.252 | 0.091 | torch.Size([180]) || stage8.2.residual_group.blocks.1.attn.proj.bias + | 0.641 | 0.337 | 0.810 | 0.081 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.weight + | -0.011 | -0.177 | 0.806 | 0.083 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.bias + | -0.006 | -0.569 | 0.598 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc11.weight + | -0.079 | -0.323 | 0.071 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.512 | 0.577 | 0.126 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc12.weight + | 
-0.003 | -0.142 | 0.161 | 0.050 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.529 | 0.572 | 0.125 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.1.mlp.fc2.weight + | -0.010 | -0.178 | 0.159 | 0.066 | torch.Size([180]) || stage8.2.residual_group.blocks.1.mlp.fc2.bias + | 0.857 | 0.199 | 1.153 | 0.112 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.weight + | -0.039 | -0.189 | 0.943 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.bias + | -0.042 | -1.962 | 2.773 | 0.246 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.783 | 0.655 | 0.123 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.2.attn.qkv_self.weight + | 0.004 | -0.338 | 0.533 | 0.099 | torch.Size([540]) || stage8.2.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.497 | 0.461 | 0.107 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.2.attn.proj.weight + | -0.008 | -0.288 | 0.183 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.2.attn.proj.bias + | 0.681 | 0.327 | 0.878 | 0.085 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.weight + | -0.012 | -0.178 | 0.773 | 0.084 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.bias + | -0.006 | -0.789 | 0.546 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc11.weight + | -0.081 | -0.249 | 0.036 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.526 | 0.555 | 0.128 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc12.weight + | 0.000 | -0.133 | 0.191 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.572 | 0.529 | 0.126 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.164 | 0.147 | 0.065 | torch.Size([180]) || stage8.2.residual_group.blocks.2.mlp.fc2.bias + | 0.877 | 0.198 | 1.043 | 0.094 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.weight + | -0.038 | -0.210 | 0.916 | 0.091 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.bias + | -0.094 | -2.974 | 4.987 | 0.299 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.964 | 1.011 | 0.126 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.3.attn.qkv_self.weight + | -0.002 | -0.404 | 0.429 | 0.101 | torch.Size([540]) || stage8.2.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.501 | 0.489 | 0.110 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.3.attn.proj.weight + | -0.021 | -0.305 | 0.208 | 0.097 | torch.Size([180]) || stage8.2.residual_group.blocks.3.attn.proj.bias + | 0.697 | 0.295 | 0.894 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.weight + | -0.015 | -0.241 | 0.712 | 0.086 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.bias + | -0.005 | -0.562 | 0.573 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc11.weight + | -0.085 | -0.302 | 0.080 | 0.060 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.734 | 0.573 | 0.130 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc12.weight + | 
0.001 | -0.150 | 0.161 | 0.054 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.671 | 0.623 | 0.127 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.3.mlp.fc2.weight + | -0.023 | -0.252 | 0.317 | 0.081 | torch.Size([180]) || stage8.2.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.278 | 0.345 | 0.064 | torch.Size([180, 180]) || stage8.2.linear.weight + | 0.004 | -0.315 | 0.148 | 0.064 | torch.Size([180]) || stage8.2.linear.bias + | 0.850 | 0.326 | 1.087 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.weight + | -0.031 | -0.334 | 0.779 | 0.106 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.bias + | -0.012 | -2.917 | 1.476 | 0.175 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.603 | 0.666 | 0.124 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.374 | 0.381 | 0.086 | torch.Size([540]) || stage8.3.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.577 | 0.605 | 0.119 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.0.attn.proj.weight + | -0.008 | -0.394 | 0.499 | 0.134 | torch.Size([180]) || stage8.3.residual_group.blocks.0.attn.proj.bias + | 0.636 | 0.321 | 0.790 | 0.073 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.weight + | -0.013 | -0.294 | 0.774 | 0.090 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.bias + | -0.004 | -0.540 | 0.539 | 0.123 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc11.weight + | -0.065 | -0.212 | 0.047 | 0.051 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.608 | 0.603 | 0.130 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc12.weight + | -0.002 | -0.177 | 0.155 | 0.051 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.573 | 0.630 | 0.129 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.0.mlp.fc2.weight + | -0.005 | -0.189 | 0.178 | 0.071 | torch.Size([180]) || stage8.3.residual_group.blocks.0.mlp.fc2.bias + | 0.899 | 0.275 | 1.048 | 0.099 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.weight + | -0.031 | -0.223 | 0.771 | 0.088 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.bias + | -0.003 | -3.151 | 1.718 | 0.202 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.1.attn.relative_position_index + | -0.000 | -0.732 | 0.868 | 0.127 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.1.attn.qkv_self.weight + | 0.002 | -0.412 | 0.350 | 0.093 | torch.Size([540]) || stage8.3.residual_group.blocks.1.attn.qkv_self.bias + | 0.001 | -0.466 | 0.487 | 0.114 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.1.attn.proj.weight + | -0.006 | -0.388 | 0.400 | 0.129 | torch.Size([180]) || stage8.3.residual_group.blocks.1.attn.proj.bias + | 0.711 | 0.381 | 0.864 | 0.082 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.weight + | -0.009 | -0.240 | 0.692 | 0.090 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.bias + | -0.005 | -0.657 | 0.639 | 0.126 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc11.weight + | -0.077 | -0.263 | 0.047 | 0.057 | torch.Size([360]) || 
stage8.3.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.673 | 0.605 | 0.134 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc12.weight + | 0.002 | -0.158 | 0.155 | 0.046 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc12.bias + | -0.000 | -0.582 | 0.585 | 0.131 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.1.mlp.fc2.weight + | -0.009 | -0.253 | 0.178 | 0.070 | torch.Size([180]) || stage8.3.residual_group.blocks.1.mlp.fc2.bias + | 0.941 | 0.262 | 1.154 | 0.094 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.weight + | -0.032 | -0.162 | 0.906 | 0.084 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.bias + | -0.005 | -3.421 | 1.350 | 0.205 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.777 | 0.735 | 0.130 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.2.attn.qkv_self.weight + | 0.000 | -0.355 | 0.421 | 0.092 | torch.Size([540]) || stage8.3.residual_group.blocks.2.attn.qkv_self.bias + | 0.000 | -0.479 | 0.475 | 0.115 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.2.attn.proj.weight + | -0.013 | -0.292 | 0.345 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.2.attn.proj.bias + | 0.743 | 0.242 | 0.919 | 0.093 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.weight + | -0.011 | -0.214 | 0.691 | 0.094 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.bias + | -0.005 | -0.633 | 0.498 | 0.127 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc11.weight + | -0.082 | -0.346 | 0.087 | 0.062 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.591 | 0.670 | 0.134 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc12.weight + | 0.001 | -0.190 | 0.151 | 0.056 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.560 | 0.637 | 0.132 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.2.mlp.fc2.weight + | -0.009 | -0.226 | 0.250 | 0.085 | torch.Size([180]) || stage8.3.residual_group.blocks.2.mlp.fc2.bias + | 0.950 | 0.250 | 1.103 | 0.086 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.weight + | -0.035 | -0.196 | 0.925 | 0.088 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.bias + | -0.026 | -3.591 | 5.653 | 0.236 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.753 | 0.637 | 0.128 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.3.attn.qkv_self.weight + | 0.000 | -0.333 | 0.432 | 0.081 | torch.Size([540]) || stage8.3.residual_group.blocks.3.attn.qkv_self.bias + | 0.001 | -0.591 | 0.591 | 0.118 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.3.attn.proj.weight + | -0.014 | -0.348 | 0.267 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.3.attn.proj.bias + | 0.735 | 0.254 | 0.893 | 0.082 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.weight + | -0.011 | -0.241 | 0.659 | 0.093 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.bias + | -0.005 | -0.628 | 0.667 | 0.125 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc11.weight + | -0.076 | -0.411 | 0.113 | 0.072 | torch.Size([360]) || 
stage8.3.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.662 | 0.578 | 0.135 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc12.weight + | -0.004 | -0.208 | 0.169 | 0.054 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.602 | 0.588 | 0.131 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.3.mlp.fc2.weight + | -0.011 | -0.218 | 0.232 | 0.096 | torch.Size([180]) || stage8.3.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.343 | 0.316 | 0.065 | torch.Size([180, 180]) || stage8.3.linear.weight + | 0.010 | -0.297 | 0.187 | 0.061 | torch.Size([180]) || stage8.3.linear.bias + | 1.012 | 0.330 | 1.282 | 0.149 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.weight + | -0.030 | -0.347 | 0.800 | 0.134 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.bias + | -0.013 | -2.816 | 3.792 | 0.236 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.807 | 0.825 | 0.131 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.0.attn.qkv_self.weight + | -0.003 | -0.429 | 0.319 | 0.083 | torch.Size([540]) || stage8.4.residual_group.blocks.0.attn.qkv_self.bias + | 0.001 | -0.553 | 0.569 | 0.136 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.0.attn.proj.weight + | -0.019 | -0.443 | 0.441 | 0.139 | torch.Size([180]) || stage8.4.residual_group.blocks.0.attn.proj.bias + | 0.638 | 0.420 | 0.797 | 0.063 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.weight + | -0.018 | -0.222 | 0.886 | 0.107 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.bias + | -0.002 | -0.576 | 0.510 | 0.117 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc11.weight + | -0.018 | -0.277 | 0.123 | 0.068 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.687 | 0.625 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc12.weight + | -0.007 | -0.264 | 0.267 | 0.076 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc12.bias + | 0.001 | -0.639 | 0.705 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.0.mlp.fc2.weight + | -0.012 | -0.255 | 0.274 | 0.095 | torch.Size([180]) || stage8.4.residual_group.blocks.0.mlp.fc2.bias + | 1.092 | 0.475 | 1.341 | 0.115 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.weight + | -0.030 | -0.294 | 0.686 | 0.113 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.bias + | 0.018 | -3.165 | 0.990 | 0.213 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.695 | 0.699 | 0.133 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.1.attn.qkv_self.weight + | 0.002 | -0.319 | 0.286 | 0.075 | torch.Size([540]) || stage8.4.residual_group.blocks.1.attn.qkv_self.bias + | -0.001 | -0.542 | 0.519 | 0.133 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.1.attn.proj.weight + | -0.017 | -0.439 | 0.451 | 0.152 | torch.Size([180]) || stage8.4.residual_group.blocks.1.attn.proj.bias + | 0.664 | 0.366 | 0.835 | 0.074 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.weight + | -0.015 | -0.217 | 0.985 | 0.103 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.bias + | 
-0.002 | -0.641 | 0.563 | 0.117 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc11.weight + | -0.022 | -0.381 | 0.161 | 0.078 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.571 | 0.642 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc12.weight + | 0.003 | -0.279 | 0.311 | 0.087 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.738 | 0.633 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.1.mlp.fc2.weight + | -0.007 | -0.254 | 0.261 | 0.084 | torch.Size([180]) || stage8.4.residual_group.blocks.1.mlp.fc2.bias + | 1.125 | 0.525 | 1.405 | 0.117 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.weight + | -0.033 | -0.186 | 0.627 | 0.082 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.bias + | 0.028 | -3.477 | 0.957 | 0.217 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.663 | 0.658 | 0.130 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.2.attn.qkv_self.weight + | -0.007 | -0.357 | 0.255 | 0.064 | torch.Size([540]) || stage8.4.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.596 | 0.578 | 0.137 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.2.attn.proj.weight + | -0.018 | -0.506 | 0.389 | 0.159 | torch.Size([180]) || stage8.4.residual_group.blocks.2.attn.proj.bias + | 0.694 | 0.319 | 0.865 | 0.084 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.weight + | -0.018 | -0.150 | 0.975 | 0.087 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.bias + | -0.002 | -0.619 | 0.565 | 0.116 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc11.weight + | -0.025 | -0.345 | 0.208 | 0.086 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.624 | 0.607 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc12.weight + | -0.003 | -0.388 | 0.290 | 0.075 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.927 | 0.675 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.325 | 0.240 | 0.096 | torch.Size([180]) || stage8.4.residual_group.blocks.2.mlp.fc2.bias + | 1.108 | 0.535 | 1.297 | 0.094 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.weight + | -0.035 | -0.213 | 0.546 | 0.064 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.bias + | 0.020 | -3.042 | 1.420 | 0.192 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.697 | 0.700 | 0.128 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.3.attn.qkv_self.weight + | -0.000 | -0.220 | 0.311 | 0.065 | torch.Size([540]) || stage8.4.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.652 | 0.592 | 0.138 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.3.attn.proj.weight + | -0.019 | -0.535 | 0.426 | 0.154 | torch.Size([180]) || stage8.4.residual_group.blocks.3.attn.proj.bias + | 0.685 | 0.225 | 0.893 | 0.082 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.weight + | -0.023 | -0.211 | 0.938 | 0.093 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.bias + | 
-0.001 | -0.501 | 0.564 | 0.113 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc11.weight + | -0.014 | -0.339 | 0.237 | 0.092 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.560 | 0.626 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc12.weight + | 0.000 | -0.231 | 0.239 | 0.075 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.544 | 0.657 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.3.mlp.fc2.weight + | -0.007 | -0.271 | 0.274 | 0.093 | torch.Size([180]) || stage8.4.residual_group.blocks.3.mlp.fc2.bias + | -0.001 | -0.473 | 0.481 | 0.069 | torch.Size([180, 180]) || stage8.4.linear.weight + | 0.029 | -0.333 | 0.194 | 0.076 | torch.Size([180]) || stage8.4.linear.bias + | 1.025 | 0.297 | 1.336 | 0.162 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.weight + | -0.034 | -0.429 | 0.872 | 0.141 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.bias + | -0.574 | -4.515 | 3.381 | 0.800 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.771 | 0.886 | 0.125 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.0.attn.qkv_self.weight + | 0.000 | -0.356 | 0.521 | 0.085 | torch.Size([540]) || stage8.5.residual_group.blocks.0.attn.qkv_self.bias + | -0.001 | -0.632 | 0.656 | 0.147 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.0.attn.proj.weight + | -0.029 | -0.329 | 0.697 | 0.127 | torch.Size([180]) || stage8.5.residual_group.blocks.0.attn.proj.bias + | 0.777 | 0.446 | 0.952 | 0.069 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.weight + | -0.022 | -0.335 | 0.920 | 0.121 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.bias + | -0.002 | -0.520 | 0.598 | 0.117 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc11.weight + | -0.013 | -0.456 | 0.200 | 0.075 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.677 | 0.642 | 0.137 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc12.weight + | 0.005 | -0.272 | 0.233 | 0.083 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.762 | 0.598 | 0.136 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.0.mlp.fc2.weight + | -0.025 | -0.244 | 0.583 | 0.111 | torch.Size([180]) || stage8.5.residual_group.blocks.0.mlp.fc2.bias + | 1.021 | 0.261 | 1.261 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.weight + | -0.033 | -0.358 | 0.867 | 0.120 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.bias + | -0.550 | -3.274 | 4.406 | 0.670 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.819 | 0.986 | 0.122 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.1.attn.qkv_self.weight + | 0.005 | -0.510 | 0.446 | 0.084 | torch.Size([540]) || stage8.5.residual_group.blocks.1.attn.qkv_self.bias + | -0.003 | -0.739 | 0.682 | 0.151 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.1.attn.proj.weight + | -0.032 | -0.318 | 0.607 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.attn.proj.bias + | 0.823 | 0.420 | 0.950 | 0.070 | torch.Size([180]) || 
stage8.5.residual_group.blocks.1.norm2.weight + | -0.021 | -0.274 | 0.882 | 0.111 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.bias + | -0.002 | -0.496 | 0.532 | 0.117 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc11.weight + | -0.028 | -0.260 | 0.194 | 0.080 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.620 | 0.586 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc12.weight + | 0.004 | -0.284 | 0.423 | 0.083 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.774 | 0.614 | 0.137 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.1.mlp.fc2.weight + | -0.028 | -0.371 | 0.561 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.mlp.fc2.bias + | 1.096 | 0.377 | 1.321 | 0.110 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.weight + | -0.033 | -0.244 | 0.755 | 0.100 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.bias + | -0.441 | -3.439 | 5.870 | 0.668 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.710 | 0.679 | 0.123 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.2.attn.qkv_self.weight + | 0.003 | -0.277 | 0.283 | 0.068 | torch.Size([540]) || stage8.5.residual_group.blocks.2.attn.qkv_self.bias + | 0.001 | -0.824 | 0.684 | 0.150 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.2.attn.proj.weight + | -0.033 | -0.390 | 0.545 | 0.155 | torch.Size([180]) || stage8.5.residual_group.blocks.2.attn.proj.bias + | 0.843 | 0.390 | 0.984 | 0.076 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.weight + | -0.022 | -0.211 | 0.854 | 0.090 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.bias + | -0.002 | -0.522 | 0.503 | 0.116 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc11.weight + | -0.024 | -0.243 | 0.219 | 0.091 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc11.bias + | -0.001 | -0.638 | 0.617 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc12.weight + | -0.004 | -0.268 | 0.380 | 0.078 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.713 | 0.769 | 0.138 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.2.mlp.fc2.weight + | -0.034 | -0.372 | 0.592 | 0.151 | torch.Size([180]) || stage8.5.residual_group.blocks.2.mlp.fc2.bias + | 1.027 | 0.318 | 1.206 | 0.094 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.weight + | -0.033 | -0.187 | 0.768 | 0.088 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.bias + | -0.347 | -2.664 | 2.684 | 0.528 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.677 | 0.676 | 0.127 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.3.attn.qkv_self.weight + | 0.002 | -0.410 | 0.354 | 0.080 | torch.Size([540]) || stage8.5.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.630 | 0.725 | 0.145 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.3.attn.proj.weight + | -0.041 | -0.385 | 0.660 | 0.163 | torch.Size([180]) || stage8.5.residual_group.blocks.3.attn.proj.bias + | 0.849 | 0.390 | 0.985 | 0.070 | torch.Size([180]) || 
stage8.5.residual_group.blocks.3.norm2.weight + | -0.023 | -0.163 | 0.810 | 0.084 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.bias + | -0.002 | -0.547 | 0.536 | 0.115 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc11.weight + | -0.012 | -0.366 | 0.252 | 0.106 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.669 | 0.597 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc12.weight + | -0.002 | -0.216 | 0.202 | 0.074 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.700 | 0.674 | 0.139 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.3.mlp.fc2.weight + | -0.032 | -0.376 | 0.666 | 0.134 | torch.Size([180]) || stage8.5.residual_group.blocks.3.mlp.fc2.bias + | -0.001 | -0.299 | 0.469 | 0.069 | torch.Size([180, 180]) || stage8.5.linear.weight + | 0.081 | -0.562 | 0.263 | 0.109 | torch.Size([180]) || stage8.5.linear.bias + | 1.111 | 0.208 | 1.434 | 0.192 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.weight + | -0.048 | -0.547 | 0.851 | 0.175 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.bias + | -0.252 | -2.157 | 6.293 | 0.490 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.664 | 0.631 | 0.123 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.0.attn.qkv_self.weight + | 0.007 | -0.293 | 0.366 | 0.078 | torch.Size([540]) || stage8.6.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.701 | 0.726 | 0.154 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.0.attn.proj.weight + | 0.030 | -0.318 | 0.331 | 0.109 | torch.Size([180]) || stage8.6.residual_group.blocks.0.attn.proj.bias + | 0.959 | 0.475 | 1.322 | 0.088 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.weight + | -0.039 | -0.421 | 0.873 | 0.151 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.bias + | -0.002 | -0.550 | 0.783 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc11.weight + | 0.002 | -0.269 | 0.152 | 0.069 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.914 | 0.839 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc12.weight + | 0.001 | -0.340 | 0.304 | 0.075 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.592 | 0.713 | 0.140 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.0.mlp.fc2.weight + | 0.002 | -0.535 | 0.384 | 0.177 | torch.Size([180]) || stage8.6.residual_group.blocks.0.mlp.fc2.bias + | 1.123 | 0.183 | 1.352 | 0.165 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.weight + | -0.047 | -0.513 | 0.903 | 0.168 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.bias + | -0.234 | -1.968 | 6.366 | 0.448 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.751 | 0.759 | 0.121 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.1.attn.qkv_self.weight + | -0.001 | -0.300 | 0.214 | 0.061 | torch.Size([540]) || stage8.6.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.657 | 0.699 | 0.148 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.1.attn.proj.weight + | 0.031 | 
-0.321 | 0.293 | 0.115 | torch.Size([180]) || stage8.6.residual_group.blocks.1.attn.proj.bias + | 0.986 | 0.416 | 1.360 | 0.096 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.weight + | -0.038 | -0.393 | 0.807 | 0.146 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.bias + | -0.001 | -0.589 | 0.620 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc11.weight + | 0.005 | -0.316 | 0.229 | 0.071 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.738 | 0.766 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc12.weight + | 0.001 | -0.252 | 0.302 | 0.072 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.674 | 0.629 | 0.140 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.1.mlp.fc2.weight + | -0.001 | -0.475 | 0.441 | 0.175 | torch.Size([180]) || stage8.6.residual_group.blocks.1.mlp.fc2.bias + | 1.097 | 0.342 | 1.294 | 0.134 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.weight + | -0.054 | -0.639 | 0.904 | 0.186 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.bias + | -0.135 | -3.252 | 1.238 | 0.360 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.672 | 0.663 | 0.128 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.2.attn.qkv_self.weight + | 0.007 | -0.170 | 0.228 | 0.046 | torch.Size([540]) || stage8.6.residual_group.blocks.2.attn.qkv_self.bias + | -0.001 | -0.660 | 0.651 | 0.147 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.2.attn.proj.weight + | 0.031 | -0.360 | 0.322 | 0.126 | torch.Size([180]) || stage8.6.residual_group.blocks.2.attn.proj.bias + | 1.004 | 0.360 | 1.381 | 0.099 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.weight + | -0.042 | -0.447 | 0.808 | 0.157 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.bias + | -0.000 | -0.600 | 0.603 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc11.weight + | 0.022 | -0.447 | 0.249 | 0.086 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.666 | 0.708 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc12.weight + | -0.002 | -0.326 | 0.272 | 0.075 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc12.bias + | -0.001 | -0.653 | 0.719 | 0.142 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.488 | 0.321 | 0.153 | torch.Size([180]) || stage8.6.residual_group.blocks.2.mlp.fc2.bias + | 1.095 | 0.272 | 1.302 | 0.123 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.weight + | -0.052 | -0.557 | 1.069 | 0.192 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.bias + | -0.196 | -2.349 | 1.401 | 0.360 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.741 | 0.657 | 0.124 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.3.attn.qkv_self.weight + | 0.001 | -0.186 | 0.141 | 0.040 | torch.Size([540]) || stage8.6.residual_group.blocks.3.attn.qkv_self.bias + | -0.001 | -0.669 | 0.671 | 0.139 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.3.attn.proj.weight + | -0.004 | -0.323 | 0.300 | 
0.124 | torch.Size([180]) || stage8.6.residual_group.blocks.3.attn.proj.bias + | 0.999 | 0.383 | 1.380 | 0.103 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.weight + | -0.044 | -0.392 | 0.694 | 0.163 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.bias + | 0.000 | -0.577 | 0.857 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc11.weight + | 0.041 | -0.394 | 0.238 | 0.087 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.924 | 0.828 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc12.weight + | -0.003 | -0.214 | 0.407 | 0.071 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.827 | 0.755 | 0.141 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.3.mlp.fc2.weight + | 0.022 | -0.296 | 0.262 | 0.107 | torch.Size([180]) || stage8.6.residual_group.blocks.3.mlp.fc2.bias + | 0.002 | -1.059 | 1.262 | 0.089 | torch.Size([180, 180]) || stage8.6.linear.weight + | 0.031 | -0.789 | 0.427 | 0.120 | torch.Size([180]) || stage8.6.linear.bias + | 0.389 | 0.079 | 1.137 | 0.176 | torch.Size([180]) || norm.weight + | -0.021 | -0.669 | 0.888 | 0.127 | torch.Size([180]) || norm.bias + | 0.000 | -0.486 | 0.568 | 0.103 | torch.Size([120, 180]) || conv_after_body.weight + | -0.000 | -0.167 | 0.168 | 0.055 | torch.Size([120]) || conv_after_body.bias + | -0.000 | -1.782 | 1.300 | 0.109 | torch.Size([64, 120, 1, 3, 3]) || conv_before_upsample.0.weight + | -0.019 | -0.542 | 0.437 | 0.162 | torch.Size([64]) || conv_before_upsample.0.bias + | 0.001 | -1.915 | 1.372 | 0.090 | torch.Size([256, 64, 1, 3, 3]) || upsample.0.weight + | -0.045 | -0.281 | 0.215 | 0.097 | torch.Size([256]) || upsample.0.bias + | -0.006 | -4.826 | 0.582 | 0.075 | torch.Size([256, 64, 1, 3, 3]) || upsample.5.weight + | -0.154 | -0.441 | 0.187 | 0.100 | torch.Size([256]) || upsample.5.bias + | 0.000 | -0.210 | 0.246 | 0.012 | torch.Size([64, 64, 1, 3, 3]) || upsample.10.weight + | 0.000 | -0.013 | 0.007 | 0.003 | torch.Size([64]) || upsample.10.bias + | 0.000 | -0.044 | 0.042 | 0.004 | torch.Size([3, 64, 1, 3, 3]) || conv_last.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([3]) || conv_last.bias + +22-03-11 10:53:04.972 : task: 001_train_vrt_videosr_bi_reds_6frames + model: vrt + gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7] + dist: False + find_unused_parameters: False + use_static_graph: True + scale: 4 + n_channels: 3 + path:[ + root: experiments + pretrained_netG: /home/cll/dev/KAIR/model_zoo/vrt/001_VRT_videosr_bi_REDS_6frames.pth + pretrained_netE: None + task: experiments/001_train_vrt_videosr_bi_reds_6frames + log: experiments/001_train_vrt_videosr_bi_reds_6frames + options: experiments/001_train_vrt_videosr_bi_reds_6frames/options + models: experiments/001_train_vrt_videosr_bi_reds_6frames/models + images: experiments/001_train_vrt_videosr_bi_reds_6frames/images + pretrained_optimizerG: None + ] + datasets:[ + train:[ + name: train_dataset + dataset_type: VideoRecurrentTrainDataset + dataroot_gt: /home/cll/datasets/REDS/train/train_sharp + dataroot_lq: /home/cll/datasets/REDS/train/train_sharp_bicubic/X4 + meta_info_file: data/meta_info/meta_info_REDS_GT.txt + filename_tmpl: 08d + filename_ext: png + val_partition: REDS4 + test_mode: False + io_backend:[ + type: disk + ] + num_frame: 4 + gt_size: 256 + interval_list: [1] + random_reverse: False + use_hflip: True + use_rot: True + dataloader_shuffle: True + dataloader_num_workers: 32 + dataloader_batch_size: 8 + phase: train + scale: 4 + 
n_channels: 3 + ] + test:[ + name: test_dataset + dataset_type: VideoRecurrentTestDataset + dataroot_gt: /home/cll/Desktop/REDS4/GT + dataroot_lq: /home/cll/Desktop/REDS4/sharp_bicubic + cache_data: True + io_backend:[ + type: disk + ] + num_frame: -1 + phase: test + scale: 4 + n_channels: 3 + ] + ] + netG:[ + net_type: vrt + upscale: 4 + img_size: [6, 64, 64] + window_size: [6, 8, 8] + depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4] + indep_reconsts: [11, 12] + embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180] + num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6] + spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth + pa_frames: 2 + deformable_groups: 12 + nonblind_denoising: False + use_checkpoint_attn: False + use_checkpoint_ffn: False + no_checkpoint_attn_blocks: [] + no_checkpoint_ffn_blocks: [] + init_type: default + scale: 4 + ] + train:[ + G_lossfn_type: charbonnier + G_lossfn_weight: 1.0 + G_charbonnier_eps: 1e-09 + E_decay: 0 + G_optimizer_type: adam + G_optimizer_lr: 0.0004 + G_optimizer_betas: [0.9, 0.99] + G_optimizer_wd: 0 + G_optimizer_clipgrad: None + G_optimizer_reuse: True + fix_iter: 20000 + fix_lr_mul: 0.125 + fix_keys: ['spynet', 'deform'] + total_iter: 300000 + G_scheduler_type: CosineAnnealingWarmRestarts + G_scheduler_periods: 300000 + G_scheduler_eta_min: 1e-07 + G_regularizer_orthstep: None + G_regularizer_clipstep: None + G_param_strict: True + E_param_strict: True + checkpoint_test: 5000 + checkpoint_save: 5000 + checkpoint_print: 200 + F_feature_layer: 34 + F_weights: 1.0 + F_lossfn_type: l1 + F_use_input_norm: True + F_use_range_norm: False + G_scheduler_restart_weights: 1 + ] + val:[ + save_img: False + pad_seq: False + flip_seq: False + center_frame_only: False + num_frame_testing: 40 + num_frame_overlapping: 2 + size_patch_testing: 128 + ] + opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json + is_train: True + merge_bn: False + merge_bn_startpoint: -1 + num_gpu: 8 + rank: 0 + world_size: 1 + +22-03-11 10:53:05.016 : Number of train images: 24,000, iters: 3,000 +22-03-11 10:53:19.424 : +Networks name: VRT +Params number: 30676435 +Net structure: +VRT( + (conv_first): Conv3d(27, 120, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (spynet): SpyNet( + (basic_module): ModuleList( + (0): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (1): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (2): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), 
padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (3): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (4): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (5): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + ) + ) + (stage1): Stage( + (reshape): Sequential( + (0): Rearrange('n c d h w -> n d h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, 
out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + 
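A note on the attention widths printed throughout stage1-stage7: qkv_self maps 120 -> 360 (packed Q, K, V), qkv_mut likewise maps 120 -> 360, and proj takes 240 inputs back to 120, which is consistent with the self-attention and mutual-attention outputs being concatenated before projection. A minimal sketch of that wiring, with hypothetical class and variable names (not KAIR's actual TMSA code):

import torch
import torch.nn as nn

class ToyMutualSelfAttention(nn.Module):
    # Sketch of the qkv_self / qkv_mut / proj wiring suggested by the repr above.
    def __init__(self, dim=120, heads=6):
        super().__init__()
        self.heads = heads
        self.qkv_self = nn.Linear(dim, dim * 3)  # packed Q, K, V for self-attention
        self.qkv_mut = nn.Linear(dim, dim * 3)   # packed Q, K, V for mutual attention
        self.proj = nn.Linear(dim * 2, dim)      # 240 -> 120: fuse the two branches
        self.softmax = nn.Softmax(dim=-1)

    def attend(self, q, k, v):
        b, n, c = q.shape
        h, d = self.heads, c // self.heads
        q, k, v = (t.reshape(b, n, h, d).transpose(1, 2) for t in (q, k, v))
        attn = self.softmax(q @ k.transpose(-2, -1) * d ** -0.5)
        return (attn @ v).transpose(1, 2).reshape(b, n, c)

    def forward(self, x1, x2):
        # Self branch: tokens of x1 attend within x1.
        out_self = self.attend(*self.qkv_self(x1).chunk(3, dim=-1))
        # Mutual branch: queries from x1, keys/values from the other frame x2.
        q = self.qkv_mut(x1).chunk(3, dim=-1)[0]
        _, k, v = self.qkv_mut(x2).chunk(3, dim=-1)
        out_mut = self.attend(q, k, v)
        return self.proj(torch.cat([out_self, out_mut], dim=-1))  # back to 120 channels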
(drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage2): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + 
(act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) 
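The Mlp_GEGLU blocks printed above and below (fc11 and fc12 both 120 -> 240, a single GELU, fc2 240 -> 120) have the shape of a GEGLU feed-forward, where one linear branch is gated by the GELU of the other. A minimal sketch under that assumption (hypothetical names, not the KAIR class itself):

import torch
import torch.nn as nn

class ToyMlpGEGLU(nn.Module):
    # GEGLU feed-forward: fc2(GELU(fc11(x)) * fc12(x)), matching the printed shapes.
    def __init__(self, dim=120, hidden=240, drop=0.0):
        super().__init__()
        self.fc11 = nn.Linear(dim, hidden)  # branch passed through GELU (the gate)
        self.fc12 = nn.Linear(dim, hidden)  # linear branch, multiplied by the gate
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden, dim)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        return self.drop(self.fc2(self.act(self.fc11(x)) * self.fc12(x)))

x = torch.randn(2, 64, 120)
print(ToyMlpGEGLU()(x).shape)  # torch.Size([2, 64, 120])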
+ (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage3): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): 
Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, 
inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage4): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, 
out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, 
kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage5): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + 
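Two patterns worth decoding in the Stage heads: (1) pa_deform's conv_offset stack ends in Conv2d(120, 324, ...), consistent with deformable_groups=12 and a 3x3 kernel (12*9*2 = 216 offset channels plus 12*9 = 108 mask channels), and its 242 input channels match two 120-channel feature maps plus a 2-channel optical flow; (2) the reshape heads differ between the downscaling stages (2-4), which open with Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) followed by Linear(480, 120), i.e. a 2x space-to-depth (4*120 = 480) plus projection, and the upscaling stages (5-7), which, as just above, invert it with Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c') and Linear(30, 120), since 120/4 = 30. A quick shape check of both directions (sizes are illustrative):

import torch
from einops import rearrange

x = torch.randn(1, 120, 6, 64, 64)  # n c d h w
down = rearrange(x, 'n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2)
print(down.shape)  # torch.Size([1, 6, 32, 32, 480]) -> Linear(480, 120)

y = torch.randn(1, 120, 6, 32, 32)  # n (neiw neih c) d h w, so c = 120 / 4 = 30
up = rearrange(y, 'n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2)
print(up.shape)  # torch.Size([1, 6, 64, 64, 30]) -> Linear(30, 120)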
(attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, 
inplace=False) + ) + ) + (stage6): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), 
eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage7): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d 
h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): 
TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage8): ModuleList( + (0): Sequential( + (0): Rearrange('n c d h w -> n d h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=120, out_features=180, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (1): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + 
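Earlier in this dump, every stage8 block logs a relative_position_bias_table of shape [225, 6] and a relative_position_index of shape [64, 64] with min 0, max 224 and mean 112. Those numbers are exactly what the standard Swin-style construction produces for an 8x8 window with 6 heads: 64 tokens, (2*8-1)^2 = 225 distinct relative offsets. A sketch of that construction (the usual recipe, assumed rather than copied from KAIR):

import torch

ws = 8  # window side, matching the [64, 64] index and [225, 6] table in the log
coords = torch.stack(torch.meshgrid(torch.arange(ws), torch.arange(ws), indexing='ij'))
coords = coords.flatten(1)                        # (2, 64) token coordinates
rel = coords[:, :, None] - coords[:, None, :]     # (2, 64, 64), offsets in [-7, 7]
rel = rel.permute(1, 2, 0) + (ws - 1)             # shift both axes into [0, 14]
index = rel[..., 0] * (2 * ws - 1) + rel[..., 1]  # flatten pairs into [0, 224]
print(index.shape, index.min().item(), index.max().item(), index.float().mean().item())
# torch.Size([64, 64]) 0 224 112.0
# At runtime the per-head bias is gathered as table[index] -> (64, 64, 6).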
(drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (2): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + 
(fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (3): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, 
out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (4): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (5): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): 
Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (6): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) 
+ (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + ) + (norm): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (conv_after_body): Linear(in_features=180, out_features=120, bias=True) + (conv_before_upsample): Sequential( + (0): Conv3d(120, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): LeakyReLU(negative_slope=0.01, inplace=True) + ) + (upsample): Upsample( + (0): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): Transpose_Dim12() + (2): PixelShuffle(upscale_factor=2) + (3): Transpose_Dim12() + (4): LeakyReLU(negative_slope=0.1, inplace=True) + (5): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (6): Transpose_Dim12() + (7): PixelShuffle(upscale_factor=2) + (8): Transpose_Dim12() + (9): LeakyReLU(negative_slope=0.1, inplace=True) + (10): Conv3d(64, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + ) + (conv_last): Conv3d(64, 3, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) +) + +22-03-11 10:53:19.603 : + | mean | min | max | std || shape + | -0.000 | -1.462 | 1.580 | 0.103 | torch.Size([120, 27, 1, 3, 3]) || conv_first.weight + | 0.005 | -0.950 | 0.885 | 0.268 | torch.Size([120]) || conv_first.bias + | 0.449 | 0.406 | 0.485 | 0.040 | torch.Size([1, 3, 1, 1]) || spynet.mean + | 0.226 | 0.224 | 0.229 | 0.003 | torch.Size([1, 3, 1, 1]) || spynet.std + | -0.000 | -0.679 | 0.720 | 0.066 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.0.basic_module.0.weight + | -0.042 | -0.894 | 0.351 | 0.344 | torch.Size([32]) || spynet.basic_module.0.basic_module.0.bias + | -0.008 | -3.201 | 0.948 | 0.097 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.0.basic_module.2.weight + | 0.059 | -1.268 | 0.732 | 0.320 | torch.Size([64]) || spynet.basic_module.0.basic_module.2.bias + | -0.010 | -4.633 | 0.568 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.0.basic_module.4.weight + | 0.159 | -0.704 | 0.859 | 0.353 | torch.Size([32]) || spynet.basic_module.0.basic_module.4.bias + | -0.024 | -1.714 | 0.414 | 0.091 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.0.basic_module.6.weight 
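Note: the VRT structure dump ends at conv_last above; the per-tensor statistics listing continues below. For reference, every TMSA block printed in the structure pairs WindowAttention with an Mlp_GEGLU feed-forward (parallel fc11/fc12 projections 180->360, a GELU gate, then fc2 back to 180). A minimal sketch of that gated MLP, assuming the standard GEGLU formulation act(fc11(x)) * fc12(x) rather than KAIR's exact source:

import torch
import torch.nn as nn

class Mlp_GEGLU(nn.Module):
    # Gated-GELU feed-forward as printed in the structure dump:
    # two parallel dim->hidden projections, a GELU gate on the first,
    # elementwise product, then hidden->dim. Dropout is p=0.0 in this run.
    def __init__(self, dim=180, hidden=360, drop=0.0):
        super().__init__()
        self.fc11 = nn.Linear(dim, hidden)  # gated branch
        self.fc12 = nn.Linear(dim, hidden)  # linear branch
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden, dim)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        return self.drop(self.fc2(self.act(self.fc11(x)) * self.fc12(x)))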
+ | 0.780 | -1.061 | 1.162 | 0.519 | torch.Size([16]) || spynet.basic_module.0.basic_module.6.bias + | 0.000 | -0.144 | 0.163 | 0.018 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.0.basic_module.8.weight + | 0.001 | -0.003 | 0.005 | 0.006 | torch.Size([2]) || spynet.basic_module.0.basic_module.8.bias + | 0.000 | -0.726 | 0.773 | 0.070 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.1.basic_module.0.weight + | -0.021 | -0.814 | 0.355 | 0.323 | torch.Size([32]) || spynet.basic_module.1.basic_module.0.bias + | -0.010 | -3.380 | 0.916 | 0.099 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.1.basic_module.2.weight + | 0.038 | -1.207 | 0.714 | 0.301 | torch.Size([64]) || spynet.basic_module.1.basic_module.2.bias + | -0.008 | -4.462 | 0.549 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.1.basic_module.4.weight + | 0.157 | -0.742 | 0.980 | 0.384 | torch.Size([32]) || spynet.basic_module.1.basic_module.4.bias + | -0.020 | -1.648 | 0.319 | 0.084 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.1.basic_module.6.weight + | 0.775 | -1.195 | 1.148 | 0.546 | torch.Size([16]) || spynet.basic_module.1.basic_module.6.bias + | -0.000 | -0.122 | 0.152 | 0.016 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.1.basic_module.8.weight + | -0.000 | -0.002 | 0.001 | 0.002 | torch.Size([2]) || spynet.basic_module.1.basic_module.8.bias + | 0.000 | -0.956 | 0.870 | 0.088 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.2.basic_module.0.weight + | -0.025 | -1.040 | 0.512 | 0.411 | torch.Size([32]) || spynet.basic_module.2.basic_module.0.bias + | -0.011 | -4.624 | 1.195 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.2.basic_module.2.weight + | 0.023 | -1.284 | 0.699 | 0.308 | torch.Size([64]) || spynet.basic_module.2.basic_module.2.bias + | -0.009 | -1.831 | 0.616 | 0.092 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.2.basic_module.4.weight + | 0.120 | -0.695 | 0.755 | 0.332 | torch.Size([32]) || spynet.basic_module.2.basic_module.4.bias + | -0.013 | -1.285 | 0.304 | 0.068 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.2.basic_module.6.weight + | 0.681 | -1.725 | 0.942 | 0.646 | torch.Size([16]) || spynet.basic_module.2.basic_module.6.bias + | 0.000 | -0.045 | 0.071 | 0.009 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.2.basic_module.8.weight + | -0.010 | -0.010 | -0.009 | 0.000 | torch.Size([2]) || spynet.basic_module.2.basic_module.8.bias + | -0.000 | -0.995 | 0.879 | 0.090 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.3.basic_module.0.weight + | -0.040 | -1.137 | 0.617 | 0.461 | torch.Size([32]) || spynet.basic_module.3.basic_module.0.bias + | -0.010 | -4.891 | 1.224 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.3.basic_module.2.weight + | 0.022 | -1.287 | 0.745 | 0.313 | torch.Size([64]) || spynet.basic_module.3.basic_module.2.bias + | -0.010 | -1.802 | 0.561 | 0.090 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.3.basic_module.4.weight + | 0.118 | -0.694 | 0.697 | 0.329 | torch.Size([32]) || spynet.basic_module.3.basic_module.4.bias + | -0.012 | -1.107 | 0.306 | 0.064 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.3.basic_module.6.weight + | 0.658 | -1.792 | 0.905 | 0.659 | torch.Size([16]) || spynet.basic_module.3.basic_module.6.bias + | 0.000 | -0.030 | 0.037 | 0.006 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.3.basic_module.8.weight + | 0.003 | -0.001 | 0.007 | 0.006 | torch.Size([2]) || spynet.basic_module.3.basic_module.8.bias + | -0.000 | -0.990 | 0.880 | 0.090 | torch.Size([32, 8, 7, 7]) || 
spynet.basic_module.4.basic_module.0.weight + | -0.010 | -1.067 | 0.596 | 0.437 | torch.Size([32]) || spynet.basic_module.4.basic_module.0.bias + | -0.010 | -5.061 | 1.229 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.4.basic_module.2.weight + | 0.024 | -1.274 | 0.830 | 0.318 | torch.Size([64]) || spynet.basic_module.4.basic_module.2.bias + | -0.009 | -1.787 | 0.563 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.4.basic_module.4.weight + | 0.130 | -0.685 | 0.743 | 0.335 | torch.Size([32]) || spynet.basic_module.4.basic_module.4.bias + | -0.011 | -0.973 | 0.292 | 0.061 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.4.basic_module.6.weight + | 0.659 | -1.855 | 0.931 | 0.679 | torch.Size([16]) || spynet.basic_module.4.basic_module.6.bias + | 0.000 | -0.034 | 0.040 | 0.005 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.4.basic_module.8.weight + | -0.001 | -0.009 | 0.007 | 0.012 | torch.Size([2]) || spynet.basic_module.4.basic_module.8.bias + | -0.000 | -0.973 | 0.853 | 0.089 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.5.basic_module.0.weight + | 0.022 | -1.001 | 0.571 | 0.440 | torch.Size([32]) || spynet.basic_module.5.basic_module.0.bias + | -0.009 | -5.095 | 1.251 | 0.119 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.5.basic_module.2.weight + | 0.026 | -1.305 | 0.880 | 0.326 | torch.Size([64]) || spynet.basic_module.5.basic_module.2.bias + | -0.008 | -1.815 | 0.561 | 0.091 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.5.basic_module.4.weight + | 0.137 | -0.711 | 0.771 | 0.342 | torch.Size([32]) || spynet.basic_module.5.basic_module.4.bias + | -0.010 | -0.986 | 0.286 | 0.059 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.5.basic_module.6.weight + | 0.671 | -1.913 | 0.966 | 0.700 | torch.Size([16]) || spynet.basic_module.5.basic_module.6.bias + | 0.000 | -0.034 | 0.028 | 0.002 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.5.basic_module.8.weight + | 0.002 | -0.013 | 0.016 | 0.020 | torch.Size([2]) || spynet.basic_module.5.basic_module.8.bias + | 1.280 | 0.669 | 1.862 | 0.274 | torch.Size([120]) || stage1.reshape.1.weight + | -0.006 | -0.324 | 0.337 | 0.106 | torch.Size([120]) || stage1.reshape.1.bias + | 0.579 | 0.129 | 1.064 | 0.236 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.weight + | -0.039 | -1.100 | 0.894 | 0.226 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.bias + | -0.134 | -4.020 | 2.585 | 0.295 | torch.Size([675, 6]) || stage1.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.579 | 0.618 | 0.113 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_self.weight + | 0.000 | -0.319 | 0.279 | 0.074 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_self.bias + | 0.001 | -0.634 | 0.686 | 0.076 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.attn.proj.weight + | -0.014 | -0.222 | 0.642 | 0.088 | torch.Size([120]) || stage1.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -1.066 | 0.928 | 0.097 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.000 | -0.146 | 0.190 | 0.033 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.781 | 0.367 | 1.203 | 0.160 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.weight + | 
0.029 | -0.378 | 0.545 | 0.159 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.bias + | 0.001 | -0.687 | 0.753 | 0.108 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc11.weight + | -0.010 | -0.229 | 0.633 | 0.095 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.674 | 0.669 | 0.117 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc12.weight + | 0.011 | -0.448 | 0.368 | 0.116 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc12.bias + | 0.001 | -0.862 | 0.941 | 0.119 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.mlp.fc2.weight + | -0.004 | -0.267 | 0.594 | 0.099 | torch.Size([120]) || stage1.residual_group1.blocks.0.mlp.fc2.bias + | 0.797 | 0.211 | 1.475 | 0.209 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.weight + | -0.161 | -1.941 | 0.746 | 0.237 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.bias + | -0.296 | -3.927 | 2.840 | 0.478 | torch.Size([675, 6]) || stage1.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.1.attn.position_bias + | 0.001 | -1.479 | 1.395 | 0.143 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_self.weight + | -0.003 | -0.381 | 0.258 | 0.063 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.526 | 0.561 | 0.079 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.attn.proj.weight + | -0.003 | -0.178 | 0.478 | 0.078 | torch.Size([120]) || stage1.residual_group1.blocks.1.attn.proj.bias + | 0.001 | -1.242 | 1.138 | 0.105 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.004 | -0.213 | 0.196 | 0.050 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.702 | 0.349 | 0.904 | 0.085 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.weight + | 0.039 | -0.646 | 0.384 | 0.132 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.872 | 0.750 | 0.131 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc11.weight + | -0.049 | -0.353 | 0.135 | 0.084 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.562 | 0.580 | 0.117 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc12.weight + | 0.000 | -0.238 | 0.457 | 0.113 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.828 | 0.685 | 0.123 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.mlp.fc2.weight + | 0.031 | -0.297 | 0.419 | 0.094 | torch.Size([120]) || stage1.residual_group1.blocks.1.mlp.fc2.bias + | 0.984 | 0.163 | 1.398 | 0.202 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.weight + | -0.167 | -1.609 | 0.367 | 0.182 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.bias + | -0.343 | -4.484 | 2.362 | 0.486 | torch.Size([675, 6]) || stage1.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.2.attn.position_bias + | 0.000 | -1.586 | 1.649 | 0.151 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_self.weight + | -0.000 | -0.220 | 
0.240 | 0.056 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.378 | 0.514 | 0.086 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.attn.proj.weight + | -0.009 | -0.143 | 0.172 | 0.059 | torch.Size([120]) || stage1.residual_group1.blocks.2.attn.proj.bias + | 0.001 | -0.639 | 0.582 | 0.102 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.000 | -0.141 | 0.173 | 0.035 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.733 | 0.277 | 0.903 | 0.081 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.weight + | 0.038 | -0.861 | 0.359 | 0.142 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.787 | 0.679 | 0.131 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc11.weight + | -0.029 | -0.365 | 0.143 | 0.076 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.574 | 0.539 | 0.120 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc12.weight + | -0.007 | -0.283 | 0.254 | 0.097 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc12.bias + | 0.001 | -0.998 | 0.522 | 0.124 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.mlp.fc2.weight + | 0.030 | -0.169 | 0.293 | 0.095 | torch.Size([120]) || stage1.residual_group1.blocks.2.mlp.fc2.bias + | 1.035 | 0.143 | 1.397 | 0.196 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.weight + | -0.161 | -1.413 | 0.084 | 0.154 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.bias + | -0.441 | -4.685 | 3.306 | 0.529 | torch.Size([675, 6]) || stage1.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.3.attn.position_bias + | 0.000 | -1.590 | 1.329 | 0.155 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_self.weight + | -0.002 | -0.266 | 0.232 | 0.049 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.366 | 0.372 | 0.084 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.attn.proj.weight + | -0.011 | -0.225 | 0.171 | 0.071 | torch.Size([120]) || stage1.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.660 | 0.801 | 0.100 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.001 | -0.139 | 0.200 | 0.031 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.724 | 0.190 | 0.911 | 0.091 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.weight + | 0.038 | -0.981 | 0.285 | 0.137 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.bias + | 0.001 | -0.611 | 0.598 | 0.130 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc11.weight + | -0.035 | -0.299 | 0.221 | 0.081 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.502 | 0.520 | 0.124 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc12.weight + | -0.002 | -0.271 | 0.215 | 0.090 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.558 | 0.898 | 0.127 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.mlp.fc2.weight + | 0.010 | -0.424 | 0.190 | 0.082 | torch.Size([120]) || stage1.residual_group1.blocks.3.mlp.fc2.bias + | 1.085 | 0.169 | 1.400 | 0.157 | torch.Size([120]) || 
stage1.residual_group1.blocks.4.norm1.weight + | -0.086 | -1.613 | 0.150 | 0.160 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.bias + | -0.541 | -3.902 | 3.728 | 0.633 | torch.Size([675, 6]) || stage1.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.4.attn.position_bias + | 0.001 | -1.879 | 1.832 | 0.150 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_self.weight + | 0.001 | -0.391 | 0.444 | 0.079 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_self.bias + | -0.000 | -0.407 | 0.448 | 0.087 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.attn.proj.weight + | -0.013 | -0.302 | 0.342 | 0.104 | torch.Size([120]) || stage1.residual_group1.blocks.4.attn.proj.bias + | -0.001 | -0.830 | 0.863 | 0.102 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.001 | -0.117 | 0.094 | 0.024 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.704 | 0.195 | 0.870 | 0.079 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.weight + | 0.031 | -1.069 | 0.276 | 0.140 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.656 | 0.555 | 0.130 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc11.weight + | -0.029 | -0.387 | 0.256 | 0.102 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc11.bias + | 0.001 | -0.590 | 0.624 | 0.127 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc12.weight + | -0.011 | -0.277 | 0.303 | 0.087 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -1.124 | 0.539 | 0.130 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.mlp.fc2.weight + | -0.006 | -0.718 | 0.133 | 0.094 | torch.Size([120]) || stage1.residual_group1.blocks.4.mlp.fc2.bias + | 1.037 | 0.176 | 1.327 | 0.158 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.weight + | -0.112 | -1.591 | 0.177 | 0.169 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.bias + | -0.438 | -2.229 | 2.797 | 0.523 | torch.Size([675, 6]) || stage1.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.5.attn.position_bias + | -0.000 | -2.212 | 1.826 | 0.153 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_self.weight + | 0.001 | -0.343 | 0.338 | 0.068 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_self.bias + | 0.000 | -0.367 | 0.451 | 0.087 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.attn.proj.weight + | -0.022 | -0.358 | 0.242 | 0.128 | torch.Size([120]) || stage1.residual_group1.blocks.5.attn.proj.bias + | 0.001 | -0.922 | 0.886 | 0.104 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.002 | -0.083 | 0.089 | 0.022 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.662 | 0.277 | 0.831 | 0.066 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.weight + | 0.025 | -0.959 | 0.261 | 0.132 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.bias + | -0.001 | -0.636 | 0.739 | 0.129 | torch.Size([240, 120]) || 
stage1.residual_group1.blocks.5.mlp.fc11.weight + | -0.030 | -0.419 | 0.517 | 0.115 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.615 | 0.709 | 0.126 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc12.weight + | 0.002 | -0.230 | 0.457 | 0.087 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc12.bias + | 0.001 | -1.724 | 1.186 | 0.132 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.mlp.fc2.weight + | -0.019 | -1.909 | 0.255 | 0.190 | torch.Size([120]) || stage1.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.242 | 0.244 | 0.057 | torch.Size([120, 120]) || stage1.linear1.weight + | 0.004 | -0.221 | 0.224 | 0.083 | torch.Size([120]) || stage1.linear1.bias + | 0.737 | 0.334 | 1.046 | 0.119 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.weight + | 0.013 | -0.911 | 0.763 | 0.193 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.bias + | -0.052 | -2.462 | 2.040 | 0.273 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.785 | 0.767 | 0.123 | torch.Size([360, 120]) || stage1.residual_group2.blocks.0.attn.qkv_self.weight + | 0.009 | -0.466 | 0.552 | 0.122 | torch.Size([360]) || stage1.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.431 | 0.475 | 0.091 | torch.Size([120, 120]) || stage1.residual_group2.blocks.0.attn.proj.weight + | -0.009 | -0.796 | 0.497 | 0.109 | torch.Size([120]) || stage1.residual_group2.blocks.0.attn.proj.bias + | 0.573 | 0.409 | 0.935 | 0.096 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.weight + | 0.015 | -0.828 | 0.839 | 0.175 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.bias + | 0.001 | -0.604 | 0.542 | 0.109 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc11.weight + | 0.037 | -0.179 | 0.273 | 0.076 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.666 | 0.553 | 0.116 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc12.weight + | -0.001 | -0.416 | 0.396 | 0.116 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc12.bias + | 0.001 | -0.654 | 0.538 | 0.118 | torch.Size([120, 240]) || stage1.residual_group2.blocks.0.mlp.fc2.weight + | -0.002 | -0.470 | 0.310 | 0.122 | torch.Size([120]) || stage1.residual_group2.blocks.0.mlp.fc2.bias + | 0.951 | 0.342 | 1.189 | 0.111 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.weight + | 0.010 | -0.697 | 0.802 | 0.166 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.bias + | -0.098 | -2.648 | 2.410 | 0.214 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.733 | 0.886 | 0.139 | torch.Size([360, 120]) || stage1.residual_group2.blocks.1.attn.qkv_self.weight + | -0.002 | -0.468 | 0.550 | 0.132 | torch.Size([360]) || stage1.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.435 | 0.377 | 0.096 | torch.Size([120, 120]) || stage1.residual_group2.blocks.1.attn.proj.weight + | -0.001 | -0.359 | 0.258 | 0.114 | torch.Size([120]) || stage1.residual_group2.blocks.1.attn.proj.bias + | 0.582 | 0.305 | 0.717 | 0.055 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.weight + | 0.008 | -0.714 | 0.833 | 0.131 | 
torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.bias + | 0.001 | -0.732 | 0.501 | 0.118 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc11.weight + | 0.004 | -0.306 | 0.267 | 0.091 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.510 | 0.533 | 0.126 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc12.weight + | -0.000 | -0.315 | 0.291 | 0.090 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc12.bias + | 0.000 | -0.736 | 0.789 | 0.126 | torch.Size([120, 240]) || stage1.residual_group2.blocks.1.mlp.fc2.weight + | -0.000 | -1.274 | 1.328 | 0.200 | torch.Size([120]) || stage1.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.390 | 0.303 | 0.069 | torch.Size([120, 120]) || stage1.linear2.weight + | 0.010 | -0.219 | 0.227 | 0.087 | torch.Size([120]) || stage1.linear2.bias + | -0.000 | -0.095 | 0.106 | 0.024 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.weight + | -0.001 | -0.036 | 0.036 | 0.013 | torch.Size([120]) || stage1.pa_deform.bias + | -0.000 | -0.136 | 0.141 | 0.017 | torch.Size([120, 242, 3, 3]) || stage1.pa_deform.conv_offset.0.weight + | -0.002 | -0.028 | 0.024 | 0.013 | torch.Size([120]) || stage1.pa_deform.conv_offset.0.bias + | -0.001 | -0.156 | 0.104 | 0.019 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.2.weight + | -0.008 | -0.055 | 0.045 | 0.022 | torch.Size([120]) || stage1.pa_deform.conv_offset.2.bias + | -0.001 | -0.098 | 0.106 | 0.018 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.4.weight + | -0.000 | -0.081 | 0.070 | 0.029 | torch.Size([120]) || stage1.pa_deform.conv_offset.4.bias + | -0.000 | -0.375 | 0.279 | 0.027 | torch.Size([324, 120, 3, 3]) || stage1.pa_deform.conv_offset.6.weight + | -0.003 | -0.074 | 0.070 | 0.028 | torch.Size([324]) || stage1.pa_deform.conv_offset.6.bias + | -0.000 | -0.776 | 0.733 | 0.114 | torch.Size([360, 360]) || stage1.pa_fuse.fc11.weight + | 0.021 | -0.239 | 0.513 | 0.121 | torch.Size([360]) || stage1.pa_fuse.fc11.bias + | 0.001 | -1.100 | 1.143 | 0.149 | torch.Size([360, 360]) || stage1.pa_fuse.fc12.weight + | 0.008 | -0.405 | 0.393 | 0.136 | torch.Size([360]) || stage1.pa_fuse.fc12.bias + | 0.000 | -0.963 | 0.899 | 0.142 | torch.Size([120, 360]) || stage1.pa_fuse.fc2.weight + | -0.055 | -0.616 | 0.599 | 0.197 | torch.Size([120]) || stage1.pa_fuse.fc2.bias + | 1.149 | 0.345 | 1.921 | 0.289 | torch.Size([480]) || stage2.reshape.1.weight + | 0.017 | -0.502 | 0.663 | 0.141 | torch.Size([480]) || stage2.reshape.1.bias + | -0.000 | -0.609 | 0.736 | 0.146 | torch.Size([120, 480]) || stage2.reshape.2.weight + | 0.006 | -0.136 | 0.404 | 0.077 | torch.Size([120]) || stage2.reshape.2.bias + | 0.686 | 0.172 | 1.113 | 0.175 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.weight + | -0.154 | -0.926 | 0.339 | 0.217 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.bias + | -0.120 | -1.869 | 4.616 | 0.310 | torch.Size([675, 6]) || stage2.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.514 | 0.499 | 0.102 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_self.weight + | -0.002 | -0.214 | 0.177 | 0.044 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_self.bias + | -0.001 | -0.499 | 0.529 | 0.093 | 
torch.Size([120, 240]) || stage2.residual_group1.blocks.0.attn.proj.weight + | -0.004 | -0.171 | 0.556 | 0.087 | torch.Size([120]) || stage2.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.642 | 0.598 | 0.083 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.141 | 0.125 | 0.027 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.592 | 0.325 | 0.794 | 0.096 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.weight + | 0.008 | -0.649 | 0.445 | 0.168 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.485 | 0.457 | 0.116 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc11.weight + | -0.053 | -0.240 | 0.171 | 0.062 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.503 | 0.462 | 0.118 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc12.weight + | 0.005 | -0.177 | 0.268 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.690 | 0.498 | 0.123 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.mlp.fc2.weight + | -0.007 | -0.270 | 0.472 | 0.097 | torch.Size([120]) || stage2.residual_group1.blocks.0.mlp.fc2.bias + | 0.864 | 0.187 | 1.221 | 0.164 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.weight + | -0.146 | -1.128 | 0.299 | 0.204 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.bias + | -0.241 | -1.607 | 8.958 | 0.356 | torch.Size([675, 6]) || stage2.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.561 | 0.538 | 0.116 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_self.weight + | 0.001 | -0.198 | 0.222 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_self.bias + | 0.001 | -0.475 | 0.479 | 0.099 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.attn.proj.weight + | -0.006 | -0.295 | 0.341 | 0.101 | torch.Size([120]) || stage2.residual_group1.blocks.1.attn.proj.bias + | 0.001 | -0.961 | 0.789 | 0.080 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.001 | -0.105 | 0.143 | 0.024 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.653 | 0.401 | 0.810 | 0.063 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.weight + | 0.009 | -0.767 | 0.367 | 0.154 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.486 | 0.499 | 0.117 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc11.weight + | -0.056 | -0.185 | 0.147 | 0.058 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.548 | 0.121 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc12.weight + | 0.002 | -0.231 | 0.177 | 0.071 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc12.bias + | -0.001 | -0.578 | 0.609 | 0.123 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.mlp.fc2.weight + | -0.003 | -0.350 | 0.216 | 0.098 | torch.Size([120]) || stage2.residual_group1.blocks.1.mlp.fc2.bias + | 0.848 | 0.172 | 1.107 | 0.144 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.weight + | -0.168 | -1.123 | 0.330 | 0.178 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.bias + | 
-0.074 | -1.239 | 4.293 | 0.247 | torch.Size([675, 6]) || stage2.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.2.attn.position_bias + | -0.001 | -0.643 | 0.531 | 0.117 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_self.weight + | 0.003 | -0.220 | 0.376 | 0.047 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.529 | 0.479 | 0.100 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.230 | 0.295 | 0.074 | torch.Size([120]) || stage2.residual_group1.blocks.2.attn.proj.bias + | -0.001 | -0.726 | 0.768 | 0.091 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.001 | -0.167 | 0.193 | 0.028 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.695 | 0.334 | 0.833 | 0.068 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.weight + | 0.012 | -0.755 | 0.517 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.bias + | 0.001 | -0.474 | 0.480 | 0.119 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc11.weight + | -0.049 | -0.218 | 0.148 | 0.067 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.529 | 0.542 | 0.124 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc12.weight + | -0.006 | -0.245 | 0.239 | 0.073 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc12.bias + | -0.001 | -0.541 | 0.485 | 0.124 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.mlp.fc2.weight + | 0.000 | -0.318 | 0.170 | 0.077 | torch.Size([120]) || stage2.residual_group1.blocks.2.mlp.fc2.bias + | 0.903 | 0.178 | 1.124 | 0.124 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.weight + | -0.138 | -1.223 | 0.440 | 0.177 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.bias + | -0.164 | -1.383 | 5.910 | 0.305 | torch.Size([675, 6]) || stage2.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.526 | 0.496 | 0.120 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_self.weight + | 0.000 | -0.250 | 0.273 | 0.061 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.447 | 0.524 | 0.097 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.attn.proj.weight + | -0.003 | -0.243 | 0.256 | 0.082 | torch.Size([120]) || stage2.residual_group1.blocks.3.attn.proj.bias + | -0.001 | -0.551 | 0.730 | 0.083 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.001 | -0.145 | 0.126 | 0.024 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.707 | 0.319 | 0.855 | 0.063 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.weight + | 0.013 | -0.839 | 0.507 | 0.155 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.509 | 0.508 | 0.118 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc11.weight + | -0.051 | -0.219 | 0.155 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc11.bias + | 
-0.000 | -0.475 | 0.592 | 0.124 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc12.weight + | -0.002 | -0.162 | 0.220 | 0.069 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.465 | 0.528 | 0.124 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.mlp.fc2.weight + | -0.002 | -0.243 | 0.286 | 0.088 | torch.Size([120]) || stage2.residual_group1.blocks.3.mlp.fc2.bias + | 0.948 | 0.220 | 1.175 | 0.108 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.weight + | -0.125 | -1.093 | 0.385 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.bias + | -0.150 | -1.632 | 4.522 | 0.341 | torch.Size([675, 6]) || stage2.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.636 | 0.543 | 0.119 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_self.weight + | -0.001 | -0.254 | 0.262 | 0.048 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_self.bias + | 0.001 | -0.632 | 0.628 | 0.112 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.attn.proj.weight + | -0.005 | -0.240 | 0.330 | 0.104 | torch.Size([120]) || stage2.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.476 | 0.479 | 0.088 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.001 | -0.112 | 0.134 | 0.020 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.686 | 0.264 | 0.797 | 0.060 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.weight + | 0.012 | -0.889 | 0.427 | 0.140 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.bias + | 0.001 | -0.476 | 0.478 | 0.117 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc11.weight + | -0.051 | -0.267 | 0.180 | 0.071 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.506 | 0.517 | 0.127 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc12.weight + | 0.002 | -0.172 | 0.241 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc12.bias + | -0.001 | -0.570 | 0.542 | 0.126 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.mlp.fc2.weight + | -0.003 | -0.631 | 0.395 | 0.123 | torch.Size([120]) || stage2.residual_group1.blocks.4.mlp.fc2.bias + | 0.912 | 0.189 | 1.122 | 0.104 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.weight + | -0.114 | -1.125 | 0.188 | 0.140 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.bias + | -0.099 | -1.285 | 1.708 | 0.236 | torch.Size([675, 6]) || stage2.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.496 | 0.540 | 0.119 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_self.weight + | 0.003 | -0.260 | 0.228 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.511 | 0.454 | 0.095 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.attn.proj.weight + | 0.000 | -0.711 | 0.286 | 0.115 | torch.Size([120]) || stage2.residual_group1.blocks.5.attn.proj.bias + | 0.000 | 
-0.444 | 0.454 | 0.082 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.000 | -0.101 | 0.133 | 0.021 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.668 | 0.312 | 0.800 | 0.056 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.weight + | 0.015 | -0.778 | 0.372 | 0.111 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.485 | 0.469 | 0.115 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc11.weight + | -0.045 | -0.294 | 0.173 | 0.083 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc11.bias + | 0.000 | -0.554 | 0.540 | 0.129 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc12.weight + | 0.001 | -0.183 | 0.199 | 0.077 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.879 | 0.824 | 0.127 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.mlp.fc2.weight + | 0.001 | -1.670 | 0.358 | 0.208 | torch.Size([120]) || stage2.residual_group1.blocks.5.mlp.fc2.bias + | 0.001 | -0.253 | 0.346 | 0.068 | torch.Size([120, 120]) || stage2.linear1.weight + | 0.007 | -0.248 | 0.241 | 0.103 | torch.Size([120]) || stage2.linear1.bias + | 1.012 | 0.613 | 1.327 | 0.116 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.weight + | 0.019 | -0.724 | 0.685 | 0.244 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.bias + | 0.003 | -2.959 | 1.705 | 0.151 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.636 | 0.617 | 0.125 | torch.Size([360, 120]) || stage2.residual_group2.blocks.0.attn.qkv_self.weight + | -0.002 | -0.291 | 0.292 | 0.085 | torch.Size([360]) || stage2.residual_group2.blocks.0.attn.qkv_self.bias + | -0.002 | -0.476 | 0.512 | 0.138 | torch.Size([120, 120]) || stage2.residual_group2.blocks.0.attn.proj.weight + | -0.002 | -0.263 | 0.398 | 0.135 | torch.Size([120]) || stage2.residual_group2.blocks.0.attn.proj.bias + | 0.677 | 0.521 | 0.840 | 0.063 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.weight + | 0.010 | -0.710 | 0.541 | 0.173 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.bias + | 0.001 | -0.540 | 0.507 | 0.112 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc11.weight + | -0.016 | -0.242 | 0.201 | 0.077 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc11.bias + | 0.000 | -0.519 | 0.479 | 0.122 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc12.weight + | -0.006 | -0.162 | 0.231 | 0.071 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc12.bias + | -0.001 | -0.449 | 0.494 | 0.121 | torch.Size([120, 240]) || stage2.residual_group2.blocks.0.mlp.fc2.weight + | 0.002 | -0.293 | 0.222 | 0.095 | torch.Size([120]) || stage2.residual_group2.blocks.0.mlp.fc2.bias + | 1.053 | 0.832 | 1.269 | 0.079 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.weight + | 0.015 | -0.549 | 0.428 | 0.189 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.bias + | 0.007 | -3.099 | 1.550 | 0.170 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.673 | 0.604 | 0.131 | torch.Size([360, 120]) || 
stage2.residual_group2.blocks.1.attn.qkv_self.weight + | -0.001 | -0.416 | 0.391 | 0.089 | torch.Size([360]) || stage2.residual_group2.blocks.1.attn.qkv_self.bias + | -0.000 | -0.569 | 0.560 | 0.139 | torch.Size([120, 120]) || stage2.residual_group2.blocks.1.attn.proj.weight + | 0.004 | -0.613 | 0.428 | 0.158 | torch.Size([120]) || stage2.residual_group2.blocks.1.attn.proj.bias + | 0.762 | 0.464 | 0.954 | 0.085 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.weight + | 0.005 | -0.745 | 0.381 | 0.117 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.441 | 0.448 | 0.110 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc11.weight + | 0.019 | -0.292 | 0.460 | 0.117 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.491 | 0.490 | 0.126 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc12.weight + | -0.007 | -0.285 | 0.177 | 0.068 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.535 | 0.631 | 0.125 | torch.Size([120, 240]) || stage2.residual_group2.blocks.1.mlp.fc2.weight + | -0.011 | -0.765 | 0.337 | 0.142 | torch.Size([120]) || stage2.residual_group2.blocks.1.mlp.fc2.bias + | 0.001 | -0.367 | 0.372 | 0.074 | torch.Size([120, 120]) || stage2.linear2.weight + | 0.009 | -0.288 | 0.342 | 0.130 | torch.Size([120]) || stage2.linear2.bias + | 0.000 | -0.112 | 0.093 | 0.022 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.weight + | -0.002 | -0.036 | 0.035 | 0.016 | torch.Size([120]) || stage2.pa_deform.bias + | 0.000 | -0.068 | 0.080 | 0.016 | torch.Size([120, 242, 3, 3]) || stage2.pa_deform.conv_offset.0.weight + | -0.009 | -0.035 | 0.023 | 0.013 | torch.Size([120]) || stage2.pa_deform.conv_offset.0.bias + | 0.000 | -0.068 | 0.079 | 0.019 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.2.weight + | -0.014 | -0.061 | 0.036 | 0.021 | torch.Size([120]) || stage2.pa_deform.conv_offset.2.bias + | -0.001 | -0.082 | 0.079 | 0.019 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.4.weight + | -0.003 | -0.075 | 0.069 | 0.035 | torch.Size([120]) || stage2.pa_deform.conv_offset.4.bias + | -0.000 | -0.166 | 0.139 | 0.016 | torch.Size([324, 120, 3, 3]) || stage2.pa_deform.conv_offset.6.weight + | -0.015 | -0.090 | 0.050 | 0.030 | torch.Size([324]) || stage2.pa_deform.conv_offset.6.bias + | -0.002 | -0.642 | 0.663 | 0.127 | torch.Size([360, 360]) || stage2.pa_fuse.fc11.weight + | 0.130 | -0.171 | 0.480 | 0.140 | torch.Size([360]) || stage2.pa_fuse.fc11.bias + | -0.000 | -0.696 | 0.620 | 0.118 | torch.Size([360, 360]) || stage2.pa_fuse.fc12.weight + | -0.007 | -0.337 | 0.301 | 0.102 | torch.Size([360]) || stage2.pa_fuse.fc12.bias + | 0.000 | -0.650 | 0.657 | 0.128 | torch.Size([120, 360]) || stage2.pa_fuse.fc2.weight + | 0.013 | -0.507 | 0.451 | 0.215 | torch.Size([120]) || stage2.pa_fuse.fc2.bias + | 1.067 | 0.372 | 1.778 | 0.269 | torch.Size([480]) || stage3.reshape.1.weight + | -0.004 | -0.699 | 0.521 | 0.227 | torch.Size([480]) || stage3.reshape.1.bias + | -0.000 | -0.643 | 0.743 | 0.138 | torch.Size([120, 480]) || stage3.reshape.2.weight + | 0.009 | -0.176 | 0.243 | 0.079 | torch.Size([120]) || stage3.reshape.2.bias + | 0.785 | 0.469 | 1.029 | 0.105 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.weight + | -0.102 | -0.716 | 0.311 | 0.179 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.bias + | -0.001 | -0.340 | 0.163 | 0.033 | torch.Size([675, 6]) || 
stage3.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.328 | 0.302 | 0.061 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_self.weight + | 0.004 | -0.232 | 0.189 | 0.063 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.343 | 0.346 | 0.058 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.attn.proj.weight + | 0.004 | -0.335 | 0.229 | 0.102 | torch.Size([120]) || stage3.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.366 | 0.325 | 0.052 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.001 | -0.091 | 0.074 | 0.017 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.751 | 0.517 | 0.928 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.weight + | 0.002 | -0.271 | 0.189 | 0.101 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.371 | 0.388 | 0.096 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc11.weight + | -0.073 | -0.203 | 0.039 | 0.046 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.400 | 0.401 | 0.094 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc12.weight + | -0.000 | -0.178 | 0.128 | 0.052 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc12.bias + | -0.001 | -0.410 | 0.429 | 0.098 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.mlp.fc2.weight + | 0.006 | -0.345 | 0.304 | 0.108 | torch.Size([120]) || stage3.residual_group1.blocks.0.mlp.fc2.bias + | 0.816 | 0.469 | 1.015 | 0.110 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.weight + | -0.103 | -0.647 | 0.225 | 0.140 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.bias + | 0.001 | -0.464 | 0.239 | 0.034 | torch.Size([675, 6]) || stage3.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.1.attn.position_bias + | -0.000 | -0.304 | 0.359 | 0.061 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_self.weight + | 0.001 | -0.173 | 0.193 | 0.047 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.299 | 0.408 | 0.055 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.attn.proj.weight + | 0.007 | -0.511 | 0.239 | 0.113 | torch.Size([120]) || stage3.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.288 | 0.254 | 0.049 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.001 | -0.060 | 0.054 | 0.016 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.796 | 0.609 | 0.971 | 0.076 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.weight + | -0.002 | -0.327 | 0.247 | 0.122 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.379 | 0.407 | 0.094 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc11.weight + | -0.077 | -0.214 | 0.034 | 0.045 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.391 | 0.432 | 0.092 | torch.Size([240, 120]) || 
stage3.residual_group1.blocks.1.mlp.fc12.weight + | 0.005 | -0.176 | 0.112 | 0.044 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc12.bias + | 0.000 | -0.378 | 0.399 | 0.093 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.mlp.fc2.weight + | 0.009 | -0.410 | 0.306 | 0.110 | torch.Size([120]) || stage3.residual_group1.blocks.1.mlp.fc2.bias + | 0.854 | 0.447 | 0.995 | 0.090 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.weight + | -0.086 | -0.513 | 0.198 | 0.116 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.bias + | -0.001 | -0.189 | 0.292 | 0.033 | torch.Size([675, 6]) || stage3.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.390 | 0.367 | 0.067 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_self.weight + | -0.002 | -0.310 | 0.284 | 0.078 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.334 | 0.296 | 0.061 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.attn.proj.weight + | 0.004 | -0.356 | 0.299 | 0.096 | torch.Size([120]) || stage3.residual_group1.blocks.2.attn.proj.bias + | 0.000 | -0.276 | 0.315 | 0.055 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.000 | -0.094 | 0.066 | 0.014 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.829 | 0.673 | 1.017 | 0.074 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.weight + | 0.003 | -0.259 | 0.228 | 0.098 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.bias + | 0.001 | -0.410 | 0.385 | 0.091 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc11.weight + | -0.085 | -0.200 | 0.017 | 0.044 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.348 | 0.378 | 0.090 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc12.weight + | 0.001 | -0.130 | 0.105 | 0.042 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.346 | 0.425 | 0.090 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.mlp.fc2.weight + | 0.005 | -0.363 | 0.241 | 0.094 | torch.Size([120]) || stage3.residual_group1.blocks.2.mlp.fc2.bias + | 0.872 | 0.554 | 1.068 | 0.102 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.weight + | -0.057 | -0.402 | 0.133 | 0.087 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.bias + | 0.003 | -0.365 | 0.217 | 0.050 | torch.Size([675, 6]) || stage3.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.359 | 0.357 | 0.065 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_self.weight + | -0.002 | -0.265 | 0.294 | 0.062 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.300 | 0.271 | 0.054 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.attn.proj.weight + | 0.002 | -0.316 | 0.215 | 0.094 | torch.Size([120]) || stage3.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.370 | 0.329 | 0.039 | torch.Size([360, 120]) || 
+ | 0.000 | -0.370 | 0.329 | 0.039 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_mut.weight
+ | 0.000 | -0.056 | 0.066 | 0.013 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 0.842 | 0.631 | 0.989 | 0.073 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.weight
+ | -0.001 | -0.216 | 0.263 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.bias
+ | 0.001 | -0.388 | 0.391 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.087 | -0.202 | 0.032 | 0.048 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc11.bias
+ | 0.000 | -0.364 | 0.428 | 0.088 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.000 | -0.137 | 0.106 | 0.043 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc12.bias
+ | -0.001 | -0.390 | 0.339 | 0.088 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.003 | -0.376 | 0.203 | 0.090 | torch.Size([120]) || stage3.residual_group1.blocks.3.mlp.fc2.bias
+ | 0.913 | 0.498 | 1.102 | 0.096 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.weight
+ | -0.048 | -0.340 | 0.105 | 0.071 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.bias
+ | 0.001 | -0.706 | 0.306 | 0.058 | torch.Size([675, 6]) || stage3.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.4.attn.position_bias
+ | 0.000 | -0.373 | 0.339 | 0.076 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.004 | -0.301 | 0.301 | 0.074 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_self.bias
+ | 0.000 | -0.278 | 0.277 | 0.058 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.attn.proj.weight
+ | 0.003 | -0.310 | 0.240 | 0.079 | torch.Size([120]) || stage3.residual_group1.blocks.4.attn.proj.bias
+ | -0.000 | -0.350 | 0.322 | 0.046 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_mut.weight
+ | -0.000 | -0.045 | 0.064 | 0.010 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.862 | 0.679 | 0.990 | 0.059 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.weight
+ | -0.004 | -0.313 | 0.190 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.bias
+ | 0.001 | -0.370 | 0.364 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc11.weight
+ | -0.092 | -0.231 | 0.129 | 0.057 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc11.bias
+ | -0.000 | -0.375 | 0.511 | 0.090 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.002 | -0.114 | 0.114 | 0.040 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc12.bias
+ | -0.000 | -0.389 | 0.354 | 0.088 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.mlp.fc2.weight
+ | 0.005 | -0.258 | 0.164 | 0.073 | torch.Size([120]) || stage3.residual_group1.blocks.4.mlp.fc2.bias
+ | 0.899 | 0.480 | 1.089 | 0.103 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.weight
+ | -0.030 | -0.257 | 0.115 | 0.056 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.bias
+ | 0.003 | -0.462 | 0.290 | 0.069 | torch.Size([675, 6]) || stage3.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.5.attn.position_bias
+ | 0.000 | -0.391 | 0.365 | 0.069 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_self.weight
+ | -0.004 | -0.232 | 0.302 | 0.064 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_self.bias
+ | -0.000 | -0.267 | 0.293 | 0.051 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.attn.proj.weight
+ | 0.000 | -0.250 | 0.182 | 0.070 | torch.Size([120]) || stage3.residual_group1.blocks.5.attn.proj.bias
+ | -0.000 | -0.238 | 0.257 | 0.033 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_mut.weight
+ | -0.001 | -0.032 | 0.033 | 0.008 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.864 | 0.651 | 1.029 | 0.070 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.weight
+ | -0.003 | -0.212 | 0.175 | 0.075 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.bias
+ | 0.000 | -0.378 | 0.379 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc11.weight
+ | -0.097 | -0.308 | 0.026 | 0.051 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc11.bias
+ | 0.000 | -0.578 | 0.401 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc12.weight
+ | -0.005 | -0.166 | 0.131 | 0.049 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc12.bias
+ | 0.000 | -0.358 | 0.376 | 0.085 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.mlp.fc2.weight
+ | 0.001 | -0.262 | 0.176 | 0.072 | torch.Size([120]) || stage3.residual_group1.blocks.5.mlp.fc2.bias
+ | 0.003 | -0.284 | 0.467 | 0.071 | torch.Size([120, 120]) || stage3.linear1.weight
+ | 0.006 | -0.201 | 0.269 | 0.090 | torch.Size([120]) || stage3.linear1.bias
+ | 0.877 | 0.568 | 1.197 | 0.115 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.weight
+ | 0.002 | -0.248 | 0.324 | 0.100 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.bias
+ | 0.000 | -0.261 | 0.125 | 0.029 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.563 | 0.552 | 0.074 | torch.Size([360, 120]) || stage3.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.005 | -0.257 | 0.302 | 0.081 | torch.Size([360]) || stage3.residual_group2.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.390 | 0.385 | 0.084 | torch.Size([120, 120]) || stage3.residual_group2.blocks.0.attn.proj.weight
+ | 0.002 | -0.450 | 0.235 | 0.125 | torch.Size([120]) || stage3.residual_group2.blocks.0.attn.proj.bias
+ | 0.986 | 0.755 | 1.165 | 0.078 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.weight
+ | -0.000 | -0.260 | 0.169 | 0.076 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.bias
+ | 0.000 | -0.355 | 0.397 | 0.087 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.046 | -0.220 | 0.086 | 0.055 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.424 | 0.368 | 0.089 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.006 | -0.111 | 0.122 | 0.038 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.354 | 0.374 | 0.090 | torch.Size([120, 240]) || stage3.residual_group2.blocks.0.mlp.fc2.weight
+ | 0.001 | -0.374 | 0.272 | 0.101 | torch.Size([120]) || stage3.residual_group2.blocks.0.mlp.fc2.bias
+ | 0.919 | 0.643 | 1.132 | 0.100 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.weight
+ | 0.000 | -0.177 | 0.181 | 0.063 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.bias
+ | 0.000 | -0.332 | 0.131 | 0.028 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.1.attn.relative_position_index
+ | -0.000 | -0.418 | 0.362 | 0.069 | torch.Size([360, 120]) || stage3.residual_group2.blocks.1.attn.qkv_self.weight
+ | -0.004 | -0.375 | 0.347 | 0.082 | torch.Size([360]) || stage3.residual_group2.blocks.1.attn.qkv_self.bias
+ | -0.001 | -0.294 | 0.354 | 0.077 | torch.Size([120, 120]) || stage3.residual_group2.blocks.1.attn.proj.weight
+ | 0.003 | -0.432 | 0.259 | 0.101 | torch.Size([120]) || stage3.residual_group2.blocks.1.attn.proj.bias
+ | 1.012 | 0.750 | 1.178 | 0.077 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.weight
+ | -0.001 | -0.171 | 0.155 | 0.060 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.bias
+ | 0.000 | -0.331 | 0.356 | 0.087 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.035 | -0.207 | 0.197 | 0.065 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.399 | 0.398 | 0.092 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc12.weight
+ | -0.002 | -0.111 | 0.129 | 0.041 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.001 | -0.353 | 0.330 | 0.088 | torch.Size([120, 240]) || stage3.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.001 | -0.328 | 0.127 | 0.064 | torch.Size([120]) || stage3.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.003 | -0.289 | 0.519 | 0.073 | torch.Size([120, 120]) || stage3.linear2.weight
+ | 0.002 | -0.318 | 0.371 | 0.144 | torch.Size([120]) || stage3.linear2.bias
+ | -0.000 | -0.086 | 0.095 | 0.022 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.weight
+ | -0.002 | -0.023 | 0.021 | 0.010 | torch.Size([120]) || stage3.pa_deform.bias
+ | -0.000 | -0.060 | 0.056 | 0.015 | torch.Size([120, 242, 3, 3]) || stage3.pa_deform.conv_offset.0.weight
+ | -0.008 | -0.035 | 0.019 | 0.013 | torch.Size([120]) || stage3.pa_deform.conv_offset.0.bias
+ | -0.001 | -0.064 | 0.062 | 0.019 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.2.weight
+ | -0.007 | -0.044 | 0.031 | 0.019 | torch.Size([120]) || stage3.pa_deform.conv_offset.2.bias
+ | 0.000 | -0.062 | 0.063 | 0.019 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.4.weight
+ | -0.006 | -0.052 | 0.043 | 0.021 | torch.Size([120]) || stage3.pa_deform.conv_offset.4.bias
+ | 0.000 | -0.081 | 0.080 | 0.011 | torch.Size([324, 120, 3, 3]) || stage3.pa_deform.conv_offset.6.weight
+ | -0.004 | -0.087 | 0.083 | 0.021 | torch.Size([324]) || stage3.pa_deform.conv_offset.6.bias
+ | -0.002 | -0.465 | 0.513 | 0.101 | torch.Size([360, 360]) || stage3.pa_fuse.fc11.weight
+ | 0.059 | -0.251 | 0.595 | 0.104 | torch.Size([360]) || stage3.pa_fuse.fc11.bias
+ | -0.000 | -0.544 | 0.531 | 0.100 | torch.Size([360, 360]) || stage3.pa_fuse.fc12.weight
+ | 0.001 | -0.589 | 0.433 | 0.106 | torch.Size([360]) || stage3.pa_fuse.fc12.bias
+ | -0.000 | -0.535 | 0.562 | 0.127 | torch.Size([120, 360]) || stage3.pa_fuse.fc2.weight
+ | -0.001 | -0.401 | 0.342 | 0.121 | torch.Size([120]) || stage3.pa_fuse.fc2.bias
+ | 0.997 | 0.921 | 1.125 | 0.028 | torch.Size([480]) || stage4.reshape.1.weight
+ | -0.000 | -0.058 | 0.059 | 0.022 | torch.Size([480]) || stage4.reshape.1.bias
+ | 0.000 | -0.155 | 0.150 | 0.031 | torch.Size([120, 480]) || stage4.reshape.2.weight
+ | 0.001 | -0.016 | 0.016 | 0.006 | torch.Size([120]) || stage4.reshape.2.bias
+ | 1.002 | 0.999 | 1.009 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.weight
+ | 0.000 | -0.002 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.bias
+ | -0.000 | -0.071 | 0.066 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.0.attn.position_bias
+ | 0.000 | -0.093 | 0.081 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.000 | -0.009 | 0.009 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.080 | 0.097 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.attn.proj.weight
+ | 0.000 | -0.035 | 0.027 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.0.attn.proj.bias
+ | 0.000 | -0.080 | 0.079 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_mut.weight
+ | -0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.bias
+ | -0.000 | -0.079 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc11.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.087 | 0.092 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc12.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.080 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.031 | 0.029 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.0.mlp.fc2.bias
+ | 1.002 | 0.997 | 1.007 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.weight
+ | -0.000 | -0.002 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.bias
+ | 0.000 | -0.066 | 0.065 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.1.attn.position_bias
+ | -0.000 | -0.078 | 0.081 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.006 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.080 | 0.083 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.attn.proj.weight
+ | -0.000 | -0.027 | 0.029 | 0.012 | torch.Size([120]) || stage4.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.077 | 0.082 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_mut.weight
+ | -0.000 | -0.006 | 0.009 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.weight
+ | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.bias
+ | -0.000 | -0.080 | 0.078 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.077 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.084 | 0.075 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.000 | -0.034 | 0.031 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.1.mlp.fc2.bias
+ | 1.002 | 0.996 | 1.008 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.weight
+ | -0.000 | -0.003 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.bias
+ | 0.001 | -0.070 | 0.071 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.2.attn.position_bias
+ | 0.000 | -0.091 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_self.weight
+ | -0.000 | -0.007 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_self.bias
+ | 0.000 | -0.080 | 0.084 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.attn.proj.weight
+ | -0.000 | -0.023 | 0.026 | 0.010 | torch.Size([120]) || stage4.residual_group1.blocks.2.attn.proj.bias
+ | -0.000 | -0.107 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_mut.weight
+ | 0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.weight
+ | 0.000 | -0.000 | 0.001 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.bias
+ | 0.000 | -0.076 | 0.077 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.000 | -0.005 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc11.bias
+ | -0.000 | -2.000 | 0.081 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc12.bias
+ | -0.000 | -0.084 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.mlp.fc2.weight
+ | 0.000 | -0.027 | 0.024 | 0.010 | torch.Size([120]) || stage4.residual_group1.blocks.2.mlp.fc2.bias
+ | 1.002 | 0.999 | 1.012 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.weight
+ | -0.000 | -0.003 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.bias
+ | 0.000 | -0.064 | 0.071 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.099 | 0.088 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_self.weight
+ | 0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_self.bias
+ | -0.000 | -0.083 | 0.084 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.attn.proj.weight
+ | -0.000 | -0.019 | 0.018 | 0.008 | torch.Size([120]) || stage4.residual_group1.blocks.3.attn.proj.bias
+ | 0.000 | -0.079 | 0.084 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_mut.weight
+ | -0.000 | -0.004 | 0.004 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.weight
+ | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.bias
+ | -0.000 | -0.078 | 0.081 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc11.bias
+ | -0.000 | -0.087 | 0.076 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc12.bias
+ | -0.000 | -0.079 | 0.082 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.000 | -0.022 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group1.blocks.3.mlp.fc2.bias
+ | 1.002 | 0.998 | 1.011 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.weight
+ | -0.001 | -0.004 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.bias
+ | 0.000 | -0.089 | 0.081 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.080 | 0.085 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_self.bias
+ | -0.000 | -0.075 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.attn.proj.weight
+ | -0.000 | -0.021 | 0.016 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.4.attn.proj.bias
+ | 0.000 | -0.082 | 0.088 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_mut.weight
+ | -0.000 | -0.004 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.weight
+ | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.bias
+ | -0.000 | -0.086 | 0.080 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc11.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc11.bias
+ | 0.000 | -0.084 | 0.083 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc12.bias
+ | 0.000 | -0.076 | 0.081 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.000 | -0.018 | 0.015 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.4.mlp.fc2.bias
+ | 1.003 | 0.997 | 1.014 | 0.003 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.weight
+ | -0.001 | -0.005 | 0.004 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.bias
+ | -0.001 | -0.070 | 0.069 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -0.097 | 0.082 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_self.weight
+ | 0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_self.bias
+ | -0.000 | -0.075 | 0.089 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.attn.proj.weight
+ | 0.000 | -0.016 | 0.015 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.5.attn.proj.bias
+ | 0.000 | -0.083 | 0.091 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_mut.weight
+ | 0.000 | -0.006 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.bias
+ | 0.000 | -0.093 | 0.083 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc11.weight
+ | 0.000 | -0.002 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc11.bias
+ | 0.000 | -0.086 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc12.bias
+ | 0.000 | -0.079 | 0.092 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.mlp.fc2.weight
+ | -0.000 | -0.012 | 0.016 | 0.005 | torch.Size([120]) || stage4.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.090 | 0.111 | 0.024 | torch.Size([120, 120]) || stage4.linear1.weight
+ | 0.001 | -0.019 | 0.029 | 0.009 | torch.Size([120]) || stage4.linear1.bias
+ | 1.000 | 0.999 | 1.003 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.bias
+ | -0.000 | -0.078 | 0.075 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.0.attn.relative_position_index
+ | 0.000 | -0.084 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.000 | -0.005 | 0.004 | 0.001 | torch.Size([360]) || stage4.residual_group2.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.079 | 0.080 | 0.020 | torch.Size([120, 120]) || stage4.residual_group2.blocks.0.attn.proj.weight
+ | 0.000 | -0.021 | 0.024 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.0.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.bias
+ | -0.000 | -0.079 | 0.072 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.077 | 0.078 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.102 | 0.078 | 0.020 | torch.Size([120, 240]) || stage4.residual_group2.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.024 | 0.020 | 0.009 | torch.Size([120]) || stage4.residual_group2.blocks.0.mlp.fc2.bias
+ | 1.001 | 0.998 | 1.003 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.weight
+ | -0.000 | -0.002 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.bias
+ | -0.000 | -0.071 | 0.079 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.1.attn.relative_position_index
+ | 0.000 | -0.078 | 0.096 | 0.020 | torch.Size([360, 120]) || stage4.residual_group2.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.005 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group2.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.077 | 0.080 | 0.020 | torch.Size([120, 120]) || stage4.residual_group2.blocks.1.attn.proj.weight
+ | 0.000 | -0.020 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.1.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.bias
+ | -0.000 | -0.085 | 0.082 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.083 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.000 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.078 | 0.078 | 0.020 | torch.Size([120, 240]) || stage4.residual_group2.blocks.1.mlp.fc2.weight
+ | 0.000 | -0.022 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.000 | -0.092 | 0.112 | 0.023 | torch.Size([120, 120]) || stage4.linear2.weight
+ | 0.000 | -0.032 | 0.049 | 0.015 | torch.Size([120]) || stage4.linear2.bias
+ | 0.000 | -0.036 | 0.037 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.weight
+ | 0.000 | -0.005 | 0.005 | 0.002 | torch.Size([120]) || stage4.pa_deform.bias
+ | -0.000 | -0.021 | 0.022 | 0.012 | torch.Size([120, 242, 3, 3]) || stage4.pa_deform.conv_offset.0.weight
+ | -0.001 | -0.021 | 0.021 | 0.012 | torch.Size([120]) || stage4.pa_deform.conv_offset.0.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.2.weight
+ | 0.002 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage4.pa_deform.conv_offset.2.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.4.weight
+ | -0.002 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage4.pa_deform.conv_offset.4.bias
+ | 0.000 | -0.003 | 0.002 | 0.000 | torch.Size([324, 120, 3, 3]) || stage4.pa_deform.conv_offset.6.weight
+ | 0.000 | -0.005 | 0.004 | 0.001 | torch.Size([324]) || stage4.pa_deform.conv_offset.6.bias
+ | 0.000 | -0.172 | 0.177 | 0.022 | torch.Size([360, 360]) || stage4.pa_fuse.fc11.weight
+ | 0.002 | -0.027 | 0.088 | 0.014 | torch.Size([360]) || stage4.pa_fuse.fc11.bias
+ | 0.000 | -0.212 | 0.163 | 0.022 | torch.Size([360, 360]) || stage4.pa_fuse.fc12.weight
+ | 0.000 | -0.066 | 0.081 | 0.014 | torch.Size([360]) || stage4.pa_fuse.fc12.bias
+ | 0.000 | -0.413 | 0.387 | 0.029 | torch.Size([120, 360]) || stage4.pa_fuse.fc2.weight
+ | -0.001 | -0.198 | 0.214 | 0.073 | torch.Size([120]) || stage4.pa_fuse.fc2.bias
+ | 0.979 | 0.896 | 1.076 | 0.053 | torch.Size([30]) || stage5.reshape.1.weight
+ | -0.005 | -0.074 | 0.100 | 0.043 | torch.Size([30]) || stage5.reshape.1.bias
+ | 0.000 | -0.240 | 0.249 | 0.058 | torch.Size([120, 30]) || stage5.reshape.2.weight
+ | -0.002 | -0.286 | 0.229 | 0.080 | torch.Size([120]) || stage5.reshape.2.bias
+ | 1.001 | 0.993 | 1.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.weight
+ | -0.004 | -0.018 | 0.006 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.bias
+ | -0.000 | -0.066 | 0.062 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.0.attn.position_bias
+ | -0.000 | -0.091 | 0.086 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.000 | -0.014 | 0.012 | 0.004 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.166 | 0.172 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.attn.proj.weight
+ | -0.001 | -0.053 | 0.045 | 0.018 | torch.Size([120]) || stage5.residual_group1.blocks.0.attn.proj.bias
+ | -0.000 | -0.090 | 0.081 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_mut.weight
+ | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.999 | 0.987 | 1.001 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.weight
+ | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.bias
+ | 0.000 | -0.094 | 0.079 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc11.weight
+ | 0.000 | -0.022 | 0.012 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.082 | 0.083 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc12.weight
+ | 0.000 | -0.013 | 0.014 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.075 | 0.083 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.073 | 0.078 | 0.021 | torch.Size([120]) || stage5.residual_group1.blocks.0.mlp.fc2.bias
+ | 1.001 | 0.994 | 1.007 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.weight
+ | -0.004 | -0.016 | 0.004 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.bias
+ | 0.000 | -0.065 | 0.063 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.1.attn.position_bias
+ | -0.000 | -0.077 | 0.083 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.022 | 0.017 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.113 | 0.098 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.attn.proj.weight
+ | 0.000 | -0.058 | 0.045 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.080 | 0.080 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_mut.weight
+ | -0.000 | -0.008 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 0.999 | 0.982 | 1.001 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.weight
+ | 0.000 | -0.006 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.bias
+ | -0.000 | -0.076 | 0.083 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc11.weight
+ | 0.000 | -0.017 | 0.014 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.080 | 0.086 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc12.weight
+ | -0.000 | -0.014 | 0.016 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.096 | 0.079 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.001 | -0.051 | 0.039 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.1.mlp.fc2.bias
+ | 1.002 | 0.998 | 1.009 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.weight
+ | -0.004 | -0.014 | 0.003 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.bias
+ | 0.000 | -0.067 | 0.073 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.2.attn.position_bias
+ | 0.000 | -0.085 | 0.087 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_self.weight
+ | 0.000 | -0.015 | 0.014 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_self.bias
+ | -0.000 | -0.108 | 0.095 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.attn.proj.weight
+ | -0.001 | -0.043 | 0.039 | 0.013 | torch.Size([120]) || stage5.residual_group1.blocks.2.attn.proj.bias
+ | -0.000 | -0.088 | 0.081 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_mut.weight
+ | -0.000 | -0.009 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 0.999 | 0.978 | 1.001 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.weight
+ | 0.000 | -0.003 | 0.004 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.bias
+ | -0.000 | -0.076 | 0.081 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.000 | -0.012 | 0.019 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc11.bias
+ | 0.000 | -0.079 | 0.077 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc12.weight
+ | -0.001 | -0.014 | 0.012 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc12.bias
+ | 0.000 | -0.076 | 0.082 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.mlp.fc2.weight
+ | -0.000 | -0.047 | 0.043 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.2.mlp.fc2.bias
+ | 1.002 | 0.978 | 1.015 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.weight
+ | -0.004 | -0.013 | 0.004 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.bias
+ | -0.000 | -0.084 | 0.070 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.078 | 0.082 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_self.weight
+ | -0.000 | -0.014 | 0.014 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_self.bias
+ | -0.000 | -0.123 | 0.132 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.attn.proj.weight
+ | 0.001 | -0.028 | 0.044 | 0.015 | torch.Size([120]) || stage5.residual_group1.blocks.3.attn.proj.bias
+ | -0.000 | -0.082 | 0.089 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_mut.weight
+ | -0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 0.999 | 0.974 | 1.001 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.weight
+ | 0.000 | -0.008 | 0.010 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.bias
+ | 0.000 | -0.075 | 0.088 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc11.weight
+ | 0.000 | -0.014 | 0.019 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc11.bias
+ | -0.000 | -0.081 | 0.080 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc12.weight
+ | 0.000 | -0.031 | 0.020 | 0.006 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.081 | 0.106 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.mlp.fc2.weight
+ | -0.002 | -0.046 | 0.042 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.3.mlp.fc2.bias
+ | 1.003 | 0.944 | 1.017 | 0.009 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.weight
+ | -0.005 | -0.015 | 0.004 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.bias
+ | -0.000 | -0.071 | 0.067 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.085 | 0.090 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.000 | -0.021 | 0.013 | 0.004 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_self.bias
+ | 0.000 | -0.130 | 0.089 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.attn.proj.weight
+ | -0.001 | -0.036 | 0.024 | 0.011 | torch.Size([120]) || stage5.residual_group1.blocks.4.attn.proj.bias
+ | 0.000 | -0.086 | 0.076 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_mut.weight
+ | 0.000 | -0.008 | 0.008 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.999 | 0.967 | 1.001 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.weight
+ | 0.000 | -0.006 | 0.007 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.bias
+ | 0.000 | -0.080 | 0.085 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc11.weight
+ | -0.001 | -0.015 | 0.010 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc11.bias
+ | -0.000 | -0.081 | 0.077 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc12.weight
+ | -0.000 | -0.020 | 0.018 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc12.bias
+ | 0.000 | -0.081 | 0.085 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.001 | -0.037 | 0.050 | 0.014 | torch.Size([120]) || stage5.residual_group1.blocks.4.mlp.fc2.bias
+ | 1.004 | 0.976 | 1.039 | 0.008 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.weight
+ | -0.005 | -0.015 | 0.005 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.bias
+ | -0.000 | -0.070 | 0.076 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.5.attn.position_bias
+ | 0.000 | -0.099 | 0.097 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_self.weight
+ | -0.000 | -0.011 | 0.012 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_self.bias
+ | -0.000 | -0.084 | 0.093 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.attn.proj.weight
+ | 0.000 | -0.038 | 0.035 | 0.012 | torch.Size([120]) || stage5.residual_group1.blocks.5.attn.proj.bias
+ | 0.000 | -0.087 | 0.082 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_mut.weight
+ | 0.000 | -0.008 | 0.010 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.998 | 0.960 | 1.002 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.weight
+ | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.bias
+ | -0.000 | -0.088 | 0.095 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc11.weight
+ | -0.000 | -0.014 | 0.027 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc11.bias
+ | -0.000 | -0.081 | 0.074 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.000 | -0.013 | 0.025 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc12.bias
+ | -0.000 | -0.100 | 0.086 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.mlp.fc2.weight
+ | 0.000 | -0.022 | 0.030 | 0.011 | torch.Size([120]) || stage5.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.102 | 0.117 | 0.023 | torch.Size([120, 120]) || stage5.linear1.weight
+ | -0.003 | -0.297 | 0.242 | 0.084 | torch.Size([120]) || stage5.linear1.bias
+ | 0.999 | 0.971 | 1.008 | 0.005 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.weight
+ | -0.000 | -0.035 | 0.034 | 0.011 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.bias
+ | 0.000 | -0.079 | 0.074 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.087 | 0.083 | 0.020 | torch.Size([360, 120]) || stage5.residual_group2.blocks.0.attn.qkv_self.weight
+ | -0.000 | -0.028 | 0.018 | 0.005 | torch.Size([360]) || stage5.residual_group2.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.079 | 0.082 | 0.021 | torch.Size([120, 120]) || stage5.residual_group2.blocks.0.attn.proj.weight
+ | -0.001 | -0.146 | 0.171 | 0.054 | torch.Size([120]) || stage5.residual_group2.blocks.0.attn.proj.bias
+ | 0.997 | 0.967 | 1.003 | 0.006 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.weight
+ | 0.000 | -0.005 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.bias
+ | -0.000 | -0.073 | 0.089 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.002 | -0.017 | 0.008 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.084 | 0.073 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc12.weight
+ | 0.000 | -0.013 | 0.011 | 0.003 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.083 | 0.085 | 0.020 | torch.Size([120, 240]) || stage5.residual_group2.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.103 | 0.140 | 0.037 | torch.Size([120]) || stage5.residual_group2.blocks.0.mlp.fc2.bias
+ | 0.999 | 0.986 | 1.010 | 0.004 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.weight
+ | 0.000 | -0.035 | 0.034 | 0.010 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.bias
+ | 0.000 | -0.087 | 0.074 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.1.attn.relative_position_index
+ | -0.000 | -0.084 | 0.079 | 0.020 | torch.Size([360, 120]) || stage5.residual_group2.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.024 | 0.024 | 0.005 | torch.Size([360]) || stage5.residual_group2.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.077 | 0.078 | 0.021 | torch.Size([120, 120]) || stage5.residual_group2.blocks.1.attn.proj.weight
+ | -0.001 | -0.112 | 0.144 | 0.038 | torch.Size([120]) || stage5.residual_group2.blocks.1.attn.proj.bias
+ | 0.998 | 0.965 | 1.004 | 0.006 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.weight
+ | 0.000 | -0.004 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.bias
+ | 0.000 | -0.088 | 0.079 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.001 | -0.012 | 0.015 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.102 | 0.080 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc12.weight
+ | 0.000 | -0.012 | 0.009 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.075 | 0.078 | 0.020 | torch.Size([120, 240]) || stage5.residual_group2.blocks.1.mlp.fc2.weight
+ | 0.000 | -0.105 | 0.131 | 0.042 | torch.Size([120]) || stage5.residual_group2.blocks.1.mlp.fc2.bias
+ | -0.000 | -0.220 | 0.209 | 0.035 | torch.Size([120, 120]) || stage5.linear2.weight
+ | -0.003 | -0.335 | 0.284 | 0.096 | torch.Size([120]) || stage5.linear2.bias
+ | -0.000 | -0.064 | 0.065 | 0.019 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.weight
+ | 0.001 | -0.050 | 0.050 | 0.029 | torch.Size([120]) || stage5.pa_deform.bias
+ | 0.000 | -0.119 | 0.106 | 0.013 | torch.Size([120, 242, 3, 3]) || stage5.pa_deform.conv_offset.0.weight
+ | -0.006 | -0.030 | 0.026 | 0.014 | torch.Size([120]) || stage5.pa_deform.conv_offset.0.bias
+ | -0.001 | -0.055 | 0.050 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.2.weight
+ | 0.001 | -0.033 | 0.031 | 0.018 | torch.Size([120]) || stage5.pa_deform.conv_offset.2.bias
+ | 0.001 | -0.060 | 0.050 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.4.weight
+ | -0.005 | -0.040 | 0.037 | 0.019 | torch.Size([120]) || stage5.pa_deform.conv_offset.4.bias
+ | 0.001 | -0.038 | 0.051 | 0.006 | torch.Size([324, 120, 3, 3]) || stage5.pa_deform.conv_offset.6.weight
+ | 0.000 | -0.048 | 0.050 | 0.017 | torch.Size([324]) || stage5.pa_deform.conv_offset.6.bias
+ | 0.000 | -0.334 | 0.340 | 0.036 | torch.Size([360, 360]) || stage5.pa_fuse.fc11.weight
+ | 0.037 | -0.050 | 0.294 | 0.064 | torch.Size([360]) || stage5.pa_fuse.fc11.bias
+ | -0.000 | -0.343 | 0.349 | 0.036 | torch.Size([360, 360]) || stage5.pa_fuse.fc12.weight
+ | -0.001 | -0.237 | 0.244 | 0.049 | torch.Size([360]) || stage5.pa_fuse.fc12.bias
+ | -0.000 | -0.575 | 0.591 | 0.060 | torch.Size([120, 360]) || stage5.pa_fuse.fc2.weight
+ | -0.001 | -0.404 | 0.344 | 0.122 | torch.Size([120]) || stage5.pa_fuse.fc2.bias
+ | 1.254 | 1.058 | 1.466 | 0.126 | torch.Size([30]) || stage6.reshape.1.weight
+ | -0.001 | -0.074 | 0.093 | 0.041 | torch.Size([30]) || stage6.reshape.1.bias
+ | 0.000 | -0.734 | 0.625 | 0.177 | torch.Size([120, 30]) || stage6.reshape.2.weight
+ | 0.003 | -0.269 | 0.341 | 0.108 | torch.Size([120]) || stage6.reshape.2.bias
+ | 0.815 | 0.495 | 1.118 | 0.121 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.weight
+ | -0.071 | -0.291 | 0.263 | 0.101 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.bias
+ | -0.000 | -0.080 | 0.087 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.0.attn.position_bias
+ | 0.000 | -0.136 | 0.134 | 0.026 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.000 | -0.061 | 0.037 | 0.014 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.201 | 0.182 | 0.032 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.attn.proj.weight
+ | 0.000 | -0.223 | 0.189 | 0.090 | torch.Size([120]) || stage6.residual_group1.blocks.0.attn.proj.bias
+ | 0.000 | -0.184 | 0.211 | 0.029 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_mut.weight
+ | 0.000 | -0.049 | 0.069 | 0.011 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.710 | 0.556 | 0.893 | 0.072 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.weight
+ | -0.003 | -0.172 | 0.193 | 0.070 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.bias
+ | 0.000 | -0.217 | 0.211 | 0.033 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc11.weight
+ | -0.041 | -0.158 | 0.025 | 0.036 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.209 | 0.178 | 0.031 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc12.weight
+ | -0.000 | -0.141 | 0.186 | 0.031 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.245 | 0.347 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.005 | -0.161 | 0.188 | 0.079 | torch.Size([120]) || stage6.residual_group1.blocks.0.mlp.fc2.bias
+ | 0.780 | 0.582 | 0.963 | 0.088 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.weight
+ | -0.112 | -0.302 | 0.103 | 0.085 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.bias
+ | 0.000 | -0.101 | 0.072 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.1.attn.position_bias
+ | 0.000 | -0.112 | 0.178 | 0.026 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_self.weight
+ | -0.000 | -0.034 | 0.049 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.223 | 0.242 | 0.033 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.attn.proj.weight
+ | -0.003 | -0.149 | 0.105 | 0.047 | torch.Size([120]) || stage6.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.199 | 0.173 | 0.031 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_mut.weight
+ | 0.000 | -0.035 | 0.056 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 0.744 | 0.530 | 0.917 | 0.066 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.weight
+ | 0.004 | -0.131 | 0.180 | 0.059 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.bias
+ | 0.000 | -0.243 | 0.294 | 0.036 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.039 | -0.217 | 0.045 | 0.037 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.206 | 0.178 | 0.033 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc12.weight
+ | -0.000 | -0.129 | 0.125 | 0.028 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.236 | 0.276 | 0.040 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.000 | -0.158 | 0.170 | 0.063 | torch.Size([120]) || stage6.residual_group1.blocks.1.mlp.fc2.bias
+ | 0.829 | 0.586 | 1.007 | 0.078 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.weight
+ | -0.101 | -0.353 | 0.132 | 0.092 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.bias
+ | -0.000 | -0.082 | 0.076 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.2.attn.position_bias
+ | 0.000 | -0.154 | 0.143 | 0.032 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_self.weight
+ | 0.000 | -0.041 | 0.038 | 0.012 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_self.bias
+ | 0.000 | -0.187 | 0.202 | 0.035 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.attn.proj.weight
+ | 0.002 | -0.096 | 0.127 | 0.041 | torch.Size([120]) || stage6.residual_group1.blocks.2.attn.proj.bias
+ | -0.000 | -0.203 | 0.185 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_mut.weight
+ | -0.000 | -0.045 | 0.049 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 0.768 | 0.491 | 0.904 | 0.069 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.weight
+ | 0.001 | -0.146 | 0.159 | 0.062 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.bias
+ | -0.000 | -0.184 | 0.204 | 0.037 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.043 | -0.185 | 0.020 | 0.035 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc11.bias
+ | -0.000 | -0.188 | 0.270 | 0.035 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc12.weight
+ | 0.000 | -0.152 | 0.134 | 0.031 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc12.bias
+ | -0.000 | -0.222 | 0.217 | 0.042 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.mlp.fc2.weight
+ | 0.002 | -0.141 | 0.144 | 0.058 | torch.Size([120]) || stage6.residual_group1.blocks.2.mlp.fc2.bias
+ | 0.820 | 0.554 | 0.976 | 0.065 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.weight
+ | -0.091 | -0.336 | 0.137 | 0.087 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.bias
+ | 0.000 | -0.124 | 0.222 | 0.023 | torch.Size([675, 6]) || stage6.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.157 | 0.175 | 0.036 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_self.weight
+ | -0.001 | -0.049 | 0.049 | 0.014 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_self.bias
+ | 0.000 | -0.238 | 0.236 | 0.036 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.attn.proj.weight
+ | -0.003 | -0.077 | 0.074 | 0.031 | torch.Size([120]) || stage6.residual_group1.blocks.3.attn.proj.bias
+ | 0.000 | -0.212 | 0.265 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_mut.weight
+ | 0.000 | -0.028 | 0.052 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 0.768 | 0.530 | 0.903 | 0.080 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.weight
+ | 0.002 | -0.104 | 0.157 | 0.044 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.bias
+ | -0.000 | -0.197 | 0.220 | 0.039 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.042 | -0.155 | 0.043 | 0.039 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc11.bias
+ | 0.000 | -0.166 | 0.199 | 0.036 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc12.weight
+ | 0.001 | -0.102 | 0.138 | 0.040 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.241 | 0.256 | 0.044 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.003 | -0.123 | 0.115 | 0.046 | torch.Size([120]) || stage6.residual_group1.blocks.3.mlp.fc2.bias
+ | 0.817 | 0.631 | 0.918 | 0.055 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.weight
+ | -0.082 | -0.295 | 0.141 | 0.074 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.bias
+ | -0.000 | -0.084 | 0.205 | 0.024 | torch.Size([675, 6]) || stage6.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.174 | 0.199 | 0.040 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.000 | -0.060 | 0.081 | 0.017 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_self.bias
+ | -0.000 | -0.194 | 0.191 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.attn.proj.weight
+ | 0.001 | -0.083 | 0.077 | 0.035 | torch.Size([120]) || stage6.residual_group1.blocks.4.attn.proj.bias
+ | -0.000 | -0.218 | 0.243 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_mut.weight
+ | -0.000 | -0.031 | 0.024 | 0.007 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.744 | 0.478 | 0.913 | 0.082 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.weight
+ | -0.003 | -0.146 | 0.110 | 0.053 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.bias
+ | -0.000 | -0.223 | 0.238 | 0.042 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc11.weight
+ | -0.046 | -0.200 | 0.071 | 0.051 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc11.bias
+ | -0.000 | -0.168 | 0.201 | 0.039 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.002 | -0.128 | 0.141 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc12.bias
+ | -0.000 | -0.220 | 0.205 | 0.047 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.mlp.fc2.weight
+ | 0.001 | -0.086 | 0.094 | 0.034 | torch.Size([120]) || stage6.residual_group1.blocks.4.mlp.fc2.bias
+ | 0.754 | 0.353 | 0.933 | 0.056 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.weight
+ | -0.058 | -0.246 | 0.105 | 0.060 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.bias
+ | -0.000 | -0.113 | 0.536 | 0.030 | torch.Size([675, 6]) || stage6.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.5.attn.position_bias
+ | 0.000 | -0.261 | 0.224 | 0.044 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_self.weight
+ | 0.002 | -0.050 | 0.067 | 0.018 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_self.bias
+ | 0.000 | -0.234 | 0.256 | 0.038 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.attn.proj.weight
+ | 0.002 | -0.079 | 0.076 | 0.036 | torch.Size([120]) || stage6.residual_group1.blocks.5.attn.proj.bias
+ | -0.000 | -0.211 | 0.231 | 0.029 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_mut.weight
+ | 0.000 | -0.033 | 0.030 | 0.008 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.677 | 0.275 | 0.833 | 0.083 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.weight
+ | 0.001 | -0.224 | 0.306 | 0.102 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.bias
+ | -0.000 | -0.196 | 0.211 | 0.045 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc11.weight
+ | -0.061 | -0.289 | 0.136 | 0.089 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc11.bias
+ | -0.000 | -0.271 | 0.312 | 0.048 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.003 | -0.166 | 0.155 | 0.075 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc12.bias
+ | 0.000 | -0.286 | 0.375 | 0.054 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.mlp.fc2.weight
+ | 0.005 | -0.054 | 0.137 | 0.031 | torch.Size([120]) || stage6.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.174 | 0.172 | 0.039 | torch.Size([120, 120]) || stage6.linear1.weight
+ | 0.002 | -0.275 | 0.348 | 0.113 | torch.Size([120]) || stage6.linear1.bias
+ | 0.704 | 0.402 | 1.002 | 0.132 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.weight
torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.bias + | -0.000 | -0.172 | 0.570 | 0.025 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.337 | 0.378 | 0.041 | torch.Size([360, 120]) || stage6.residual_group2.blocks.0.attn.qkv_self.weight + | -0.000 | -0.071 | 0.068 | 0.019 | torch.Size([360]) || stage6.residual_group2.blocks.0.attn.qkv_self.bias + | 0.001 | -0.290 | 0.321 | 0.055 | torch.Size([120, 120]) || stage6.residual_group2.blocks.0.attn.proj.weight + | 0.001 | -0.255 | 0.250 | 0.104 | torch.Size([120]) || stage6.residual_group2.blocks.0.attn.proj.bias + | 0.695 | 0.353 | 0.966 | 0.098 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.weight + | -0.001 | -0.218 | 0.165 | 0.080 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.259 | 0.255 | 0.039 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc11.weight + | -0.044 | -0.256 | 0.042 | 0.047 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.234 | 0.214 | 0.035 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc12.weight + | 0.002 | -0.133 | 0.091 | 0.027 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.333 | 0.296 | 0.042 | torch.Size([120, 240]) || stage6.residual_group2.blocks.0.mlp.fc2.weight + | 0.003 | -0.238 | 0.280 | 0.092 | torch.Size([120]) || stage6.residual_group2.blocks.0.mlp.fc2.bias + | 0.671 | 0.425 | 0.980 | 0.094 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.weight + | 0.001 | -0.261 | 0.305 | 0.119 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.372 | 0.942 | 0.031 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.450 | 0.494 | 0.045 | torch.Size([360, 120]) || stage6.residual_group2.blocks.1.attn.qkv_self.weight + | 0.000 | -0.133 | 0.119 | 0.029 | torch.Size([360]) || stage6.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.239 | 0.288 | 0.046 | torch.Size([120, 120]) || stage6.residual_group2.blocks.1.attn.proj.weight + | -0.001 | -0.187 | 0.157 | 0.064 | torch.Size([120]) || stage6.residual_group2.blocks.1.attn.proj.bias + | 0.687 | 0.160 | 0.907 | 0.128 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.weight + | -0.002 | -0.192 | 0.222 | 0.084 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.257 | 0.426 | 0.042 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc11.weight + | -0.064 | -0.207 | 0.036 | 0.048 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.269 | 0.224 | 0.038 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc12.weight + | -0.000 | -0.126 | 0.129 | 0.030 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.308 | 0.298 | 0.041 | torch.Size([120, 240]) || stage6.residual_group2.blocks.1.mlp.fc2.weight + | -0.004 | -0.180 | 0.192 | 0.061 | torch.Size([120]) || stage6.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.297 | 0.368 | 0.069 | torch.Size([120, 120]) || stage6.linear2.weight + | 0.001 | -0.431 | 0.480 | 0.189 | torch.Size([120]) || stage6.linear2.bias + | 0.000 | 
-0.100 | 0.104 | 0.023 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.weight + | 0.001 | -0.018 | 0.029 | 0.010 | torch.Size([120]) || stage6.pa_deform.bias + | 0.000 | -0.105 | 0.111 | 0.015 | torch.Size([120, 242, 3, 3]) || stage6.pa_deform.conv_offset.0.weight + | -0.007 | -0.033 | 0.024 | 0.014 | torch.Size([120]) || stage6.pa_deform.conv_offset.0.bias + | -0.001 | -0.071 | 0.067 | 0.019 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.2.weight + | -0.003 | -0.061 | 0.043 | 0.022 | torch.Size([120]) || stage6.pa_deform.conv_offset.2.bias + | -0.000 | -0.074 | 0.068 | 0.019 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.4.weight + | 0.001 | -0.075 | 0.056 | 0.030 | torch.Size([120]) || stage6.pa_deform.conv_offset.4.bias + | 0.001 | -0.124 | 0.108 | 0.013 | torch.Size([324, 120, 3, 3]) || stage6.pa_deform.conv_offset.6.weight + | -0.001 | -0.113 | 0.076 | 0.021 | torch.Size([324]) || stage6.pa_deform.conv_offset.6.bias + | -0.001 | -0.517 | 0.524 | 0.101 | torch.Size([360, 360]) || stage6.pa_fuse.fc11.weight + | 0.154 | -0.305 | 0.679 | 0.180 | torch.Size([360]) || stage6.pa_fuse.fc11.bias + | 0.000 | -0.680 | 0.728 | 0.103 | torch.Size([360, 360]) || stage6.pa_fuse.fc12.weight + | 0.020 | -0.514 | 0.417 | 0.199 | torch.Size([360]) || stage6.pa_fuse.fc12.bias + | -0.000 | -0.587 | 0.737 | 0.135 | torch.Size([120, 360]) || stage6.pa_fuse.fc2.weight + | 0.015 | -0.437 | 0.490 | 0.230 | torch.Size([120]) || stage6.pa_fuse.fc2.bias + | 1.284 | 1.119 | 1.404 | 0.055 | torch.Size([30]) || stage7.reshape.1.weight + | -0.014 | -0.286 | 0.184 | 0.122 | torch.Size([30]) || stage7.reshape.1.bias + | -0.000 | -0.521 | 0.576 | 0.154 | torch.Size([120, 30]) || stage7.reshape.2.weight + | 0.004 | -0.387 | 0.738 | 0.175 | torch.Size([120]) || stage7.reshape.2.bias + | 0.440 | 0.099 | 0.775 | 0.141 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.weight + | -0.177 | -0.670 | 0.319 | 0.183 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.bias + | -0.055 | -2.159 | 1.979 | 0.240 | torch.Size([675, 6]) || stage7.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.535 | 0.554 | 0.104 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_self.weight + | 0.003 | -0.193 | 0.281 | 0.053 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_self.bias + | -0.001 | -0.397 | 0.395 | 0.075 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.attn.proj.weight + | -0.001 | -0.232 | 0.692 | 0.106 | torch.Size([120]) || stage7.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.899 | 1.073 | 0.091 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.122 | 0.104 | 0.017 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.310 | 0.157 | 0.440 | 0.055 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.weight + | 0.006 | -0.474 | 0.266 | 0.105 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.605 | 0.490 | 0.115 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc11.weight + | -0.101 | -0.310 | 0.126 | 0.070 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.448 | 0.475 | 0.116 | torch.Size([240, 120]) || 
stage7.residual_group1.blocks.0.mlp.fc12.weight + | 0.006 | -0.185 | 0.215 | 0.071 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc12.bias + | 0.001 | -0.465 | 0.512 | 0.122 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.mlp.fc2.weight + | 0.000 | -0.150 | 0.417 | 0.077 | torch.Size([120]) || stage7.residual_group1.blocks.0.mlp.fc2.bias + | 0.577 | 0.165 | 0.829 | 0.105 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.weight + | -0.136 | -0.849 | 0.206 | 0.141 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.bias + | -0.143 | -3.020 | 4.621 | 0.357 | torch.Size([675, 6]) || stage7.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.647 | 0.640 | 0.123 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_self.weight + | -0.002 | -0.356 | 0.382 | 0.064 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.457 | 0.378 | 0.081 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.attn.proj.weight + | 0.000 | -0.250 | 0.707 | 0.108 | torch.Size([120]) || stage7.residual_group1.blocks.1.attn.proj.bias + | -0.001 | -1.055 | 1.091 | 0.096 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.001 | -0.093 | 0.123 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.411 | 0.265 | 0.535 | 0.044 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.weight + | 0.008 | -0.630 | 0.264 | 0.121 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.501 | 0.506 | 0.119 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc11.weight + | -0.087 | -0.341 | 0.140 | 0.073 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.450 | 0.527 | 0.119 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc12.weight + | 0.005 | -0.188 | 0.171 | 0.063 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.554 | 0.546 | 0.121 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.mlp.fc2.weight + | -0.000 | -0.135 | 0.220 | 0.061 | torch.Size([120]) || stage7.residual_group1.blocks.1.mlp.fc2.bias + | 0.655 | 0.134 | 0.896 | 0.130 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.weight + | -0.139 | -0.788 | 0.181 | 0.115 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.bias + | -0.062 | -3.469 | 3.276 | 0.272 | torch.Size([675, 6]) || stage7.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.2.attn.position_bias + | -0.000 | -0.592 | 0.650 | 0.124 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_self.weight + | -0.000 | -0.308 | 0.218 | 0.062 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.355 | 0.345 | 0.082 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.213 | 0.700 | 0.097 | torch.Size([120]) || stage7.residual_group1.blocks.2.attn.proj.bias + | -0.001 | -1.166 | 0.942 | 0.107 | torch.Size([360, 120]) || 
stage7.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.000 | -0.106 | 0.093 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.466 | 0.317 | 0.565 | 0.042 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.weight + | 0.014 | -0.657 | 0.280 | 0.118 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.541 | 0.494 | 0.118 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc11.weight + | -0.079 | -0.335 | 0.122 | 0.080 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.513 | 0.493 | 0.123 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc12.weight + | -0.007 | -0.180 | 0.175 | 0.066 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc12.bias + | -0.001 | -0.509 | 0.479 | 0.123 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.mlp.fc2.weight + | 0.004 | -0.093 | 0.293 | 0.054 | torch.Size([120]) || stage7.residual_group1.blocks.2.mlp.fc2.bias + | 0.693 | 0.147 | 0.945 | 0.133 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.weight + | -0.132 | -0.906 | 0.249 | 0.113 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.bias + | -0.108 | -3.576 | 4.241 | 0.344 | torch.Size([675, 6]) || stage7.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.945 | 1.095 | 0.129 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_self.weight + | 0.003 | -0.274 | 0.204 | 0.061 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_self.bias + | -0.001 | -0.379 | 0.351 | 0.081 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.attn.proj.weight + | 0.000 | -0.211 | 0.587 | 0.095 | torch.Size([120]) || stage7.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -1.269 | 1.067 | 0.102 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.001 | -0.091 | 0.117 | 0.021 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.499 | 0.285 | 0.570 | 0.040 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.weight + | 0.012 | -0.567 | 0.273 | 0.104 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.bias + | 0.001 | -0.528 | 0.499 | 0.118 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc11.weight + | -0.084 | -0.349 | 0.141 | 0.078 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.547 | 0.592 | 0.126 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc12.weight + | 0.002 | -0.154 | 0.176 | 0.068 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc12.bias + | 0.001 | -0.520 | 0.480 | 0.125 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.mlp.fc2.weight + | 0.001 | -0.150 | 0.207 | 0.065 | torch.Size([120]) || stage7.residual_group1.blocks.3.mlp.fc2.bias + | 0.726 | 0.137 | 1.004 | 0.160 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.weight + | -0.122 | -0.907 | 0.180 | 0.103 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.bias + | -0.078 | -3.824 | 4.241 | 0.297 | torch.Size([675, 6]) || stage7.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || 
stage7.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.4.attn.position_bias + | -0.000 | -1.188 | 0.796 | 0.127 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_self.weight + | 0.002 | -0.248 | 0.207 | 0.056 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_self.bias + | -0.001 | -0.409 | 0.369 | 0.085 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.attn.proj.weight + | 0.002 | -0.224 | 0.322 | 0.094 | torch.Size([120]) || stage7.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -1.744 | 1.273 | 0.110 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.001 | -0.092 | 0.113 | 0.019 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.514 | 0.277 | 0.614 | 0.041 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.weight + | 0.016 | -0.621 | 0.286 | 0.095 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.bias + | 0.001 | -0.517 | 0.453 | 0.116 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc11.weight + | -0.064 | -0.260 | 0.143 | 0.083 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.503 | 0.554 | 0.129 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc12.weight + | -0.004 | -0.232 | 0.193 | 0.075 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc12.bias + | -0.001 | -0.595 | 0.543 | 0.128 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.mlp.fc2.weight + | 0.001 | -0.196 | 0.198 | 0.071 | torch.Size([120]) || stage7.residual_group1.blocks.4.mlp.fc2.bias + | 0.731 | 0.152 | 1.075 | 0.114 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.weight + | -0.076 | -1.003 | 0.176 | 0.107 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.bias + | -0.121 | -3.281 | 4.671 | 0.296 | torch.Size([675, 6]) || stage7.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.640 | 1.083 | 0.122 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_self.weight + | -0.001 | -0.239 | 0.314 | 0.068 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_self.bias + | 0.001 | -0.344 | 0.452 | 0.078 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.attn.proj.weight + | 0.004 | -0.361 | 0.251 | 0.093 | torch.Size([120]) || stage7.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.637 | 0.806 | 0.093 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.000 | -0.088 | 0.091 | 0.017 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.514 | 0.238 | 0.594 | 0.042 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.weight + | 0.017 | -0.650 | 0.162 | 0.089 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.bias + | 0.000 | -0.442 | 0.479 | 0.114 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc11.weight + | -0.040 | -0.400 | 0.203 | 0.101 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.541 | 0.514 | 0.130 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc12.weight + | -0.008 | -0.319 | 0.309 | 0.092 | torch.Size([240]) || 
stage7.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -1.018 | 1.398 | 0.130 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.mlp.fc2.weight + | 0.001 | -1.606 | 0.269 | 0.179 | torch.Size([120]) || stage7.residual_group1.blocks.5.mlp.fc2.bias + | 0.000 | -0.186 | 0.207 | 0.048 | torch.Size([120, 120]) || stage7.linear1.weight + | 0.010 | -0.448 | 0.437 | 0.161 | torch.Size([120]) || stage7.linear1.bias + | 0.703 | 0.381 | 0.856 | 0.084 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.weight + | 0.014 | -0.645 | 0.486 | 0.169 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.bias + | -0.007 | -4.468 | 1.008 | 0.164 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage7.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.625 | 0.834 | 0.120 | torch.Size([360, 120]) || stage7.residual_group2.blocks.0.attn.qkv_self.weight + | -0.009 | -0.737 | 0.632 | 0.135 | torch.Size([360]) || stage7.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.403 | 0.406 | 0.088 | torch.Size([120, 120]) || stage7.residual_group2.blocks.0.attn.proj.weight + | -0.007 | -0.338 | 0.165 | 0.070 | torch.Size([120]) || stage7.residual_group2.blocks.0.attn.proj.bias + | 0.435 | 0.323 | 0.526 | 0.038 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.weight + | 0.005 | -0.678 | 0.379 | 0.117 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.465 | 0.467 | 0.110 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc11.weight + | -0.031 | -0.236 | 0.180 | 0.077 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.490 | 0.520 | 0.121 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc12.weight + | -0.003 | -0.197 | 0.242 | 0.069 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc12.bias + | -0.000 | -0.525 | 0.501 | 0.122 | torch.Size([120, 240]) || stage7.residual_group2.blocks.0.mlp.fc2.weight + | -0.005 | -0.431 | 0.164 | 0.077 | torch.Size([120]) || stage7.residual_group2.blocks.0.mlp.fc2.bias + | 0.703 | 0.306 | 0.866 | 0.079 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.weight + | 0.009 | -0.647 | 0.481 | 0.149 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.bias + | -0.010 | -3.504 | 1.842 | 0.134 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage7.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.639 | 0.590 | 0.122 | torch.Size([360, 120]) || stage7.residual_group2.blocks.1.attn.qkv_self.weight + | -0.001 | -0.613 | 0.609 | 0.148 | torch.Size([360]) || stage7.residual_group2.blocks.1.attn.qkv_self.bias + | 0.001 | -0.316 | 0.325 | 0.085 | torch.Size([120, 120]) || stage7.residual_group2.blocks.1.attn.proj.weight + | -0.004 | -0.350 | 0.145 | 0.069 | torch.Size([120]) || stage7.residual_group2.blocks.1.attn.proj.bias + | 0.452 | 0.309 | 0.558 | 0.037 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.weight + | 0.003 | -0.661 | 0.246 | 0.091 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.580 | 0.410 | 0.108 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc11.weight + | -0.020 | -0.258 | 0.299 | 0.104 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.561 | 0.126 | 
torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc12.weight + | -0.002 | -0.234 | 0.434 | 0.090 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.778 | 0.581 | 0.124 | torch.Size([120, 240]) || stage7.residual_group2.blocks.1.mlp.fc2.weight + | -0.001 | -0.888 | 0.286 | 0.135 | torch.Size([120]) || stage7.residual_group2.blocks.1.mlp.fc2.bias + | -0.001 | -0.348 | 0.237 | 0.060 | torch.Size([120, 120]) || stage7.linear2.weight + | 0.023 | -0.390 | 0.506 | 0.167 | torch.Size([120]) || stage7.linear2.bias + | -0.000 | -0.104 | 0.107 | 0.024 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.weight + | 0.002 | -0.041 | 0.035 | 0.016 | torch.Size([120]) || stage7.pa_deform.bias + | -0.000 | -0.123 | 0.109 | 0.017 | torch.Size([120, 242, 3, 3]) || stage7.pa_deform.conv_offset.0.weight + | -0.002 | -0.034 | 0.032 | 0.015 | torch.Size([120]) || stage7.pa_deform.conv_offset.0.bias + | -0.001 | -0.111 | 0.084 | 0.019 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.2.weight + | -0.008 | -0.073 | 0.081 | 0.034 | torch.Size([120]) || stage7.pa_deform.conv_offset.2.bias + | -0.002 | -0.154 | 0.122 | 0.018 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.4.weight + | 0.014 | -0.041 | 0.068 | 0.026 | torch.Size([120]) || stage7.pa_deform.conv_offset.4.bias + | -0.001 | -0.408 | 0.365 | 0.034 | torch.Size([324, 120, 3, 3]) || stage7.pa_deform.conv_offset.6.weight + | -0.003 | -0.057 | 0.054 | 0.024 | torch.Size([324]) || stage7.pa_deform.conv_offset.6.bias + | 0.000 | -0.697 | 0.606 | 0.123 | torch.Size([360, 360]) || stage7.pa_fuse.fc11.weight + | 0.119 | -0.211 | 0.720 | 0.177 | torch.Size([360]) || stage7.pa_fuse.fc11.bias + | 0.000 | -1.175 | 0.924 | 0.154 | torch.Size([360, 360]) || stage7.pa_fuse.fc12.weight + | -0.000 | -0.581 | 0.580 | 0.190 | torch.Size([360]) || stage7.pa_fuse.fc12.bias + | 0.001 | -0.786 | 0.874 | 0.135 | torch.Size([120, 360]) || stage7.pa_fuse.fc2.weight + | -0.053 | -0.522 | 0.577 | 0.205 | torch.Size([120]) || stage7.pa_fuse.fc2.bias + | 1.225 | 1.000 | 1.516 | 0.095 | torch.Size([120]) || stage8.0.1.weight + | -0.013 | -0.413 | 0.465 | 0.139 | torch.Size([120]) || stage8.0.1.bias + | 0.000 | -2.505 | 0.627 | 0.136 | torch.Size([180, 120]) || stage8.0.2.weight + | 0.005 | -0.397 | 0.377 | 0.107 | torch.Size([180]) || stage8.0.2.bias + | 0.456 | 0.123 | 0.760 | 0.129 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.weight + | -0.022 | -0.343 | 0.875 | 0.099 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.bias + | -0.014 | -1.907 | 2.592 | 0.130 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.632 | 0.628 | 0.099 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.0.attn.qkv_self.weight + | 0.006 | -0.567 | 0.668 | 0.148 | torch.Size([540]) || stage8.1.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.477 | 0.447 | 0.094 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.0.attn.proj.weight + | -0.010 | -0.460 | 0.225 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.0.attn.proj.bias + | 0.429 | 0.119 | 0.634 | 0.090 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.weight + | -0.007 | -0.338 | 0.803 | 0.086 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.bias + | -0.006 | -0.572 | 0.539 | 0.119 | torch.Size([360, 180]) || 
stage8.1.residual_group.blocks.0.mlp.fc11.weight + | -0.060 | -0.260 | 0.185 | 0.060 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.461 | 0.548 | 0.113 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc12.weight + | 0.000 | -0.163 | 0.183 | 0.050 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.757 | 0.581 | 0.118 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.0.mlp.fc2.weight + | -0.003 | -0.191 | 0.121 | 0.057 | torch.Size([180]) || stage8.1.residual_group.blocks.0.mlp.fc2.bias + | 0.557 | 0.086 | 0.800 | 0.112 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.weight + | -0.029 | -0.230 | 0.878 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.bias + | -0.016 | -2.004 | 1.711 | 0.154 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.690 | 0.575 | 0.109 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.1.attn.qkv_self.weight + | 0.011 | -0.641 | 0.609 | 0.135 | torch.Size([540]) || stage8.1.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.466 | 0.401 | 0.094 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.1.attn.proj.weight + | -0.008 | -0.344 | 0.181 | 0.080 | torch.Size([180]) || stage8.1.residual_group.blocks.1.attn.proj.bias + | 0.503 | 0.226 | 0.742 | 0.093 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.weight + | -0.009 | -0.404 | 0.818 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.bias + | -0.007 | -0.595 | 0.532 | 0.121 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc11.weight + | -0.068 | -0.261 | 0.071 | 0.053 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.573 | 0.116 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc12.weight + | 0.002 | -0.129 | 0.197 | 0.046 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.556 | 0.582 | 0.118 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.1.mlp.fc2.weight + | -0.003 | -0.170 | 0.145 | 0.052 | torch.Size([180]) || stage8.1.residual_group.blocks.1.mlp.fc2.bias + | 0.699 | 0.202 | 0.912 | 0.109 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.weight + | -0.033 | -0.253 | 0.924 | 0.091 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.bias + | -0.030 | -2.510 | 2.088 | 0.194 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.637 | 0.801 | 0.116 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.2.attn.qkv_self.weight + | 0.006 | -0.512 | 0.520 | 0.110 | torch.Size([540]) || stage8.1.residual_group.blocks.2.attn.qkv_self.bias + | 0.000 | -0.381 | 0.337 | 0.090 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.2.attn.proj.weight + | -0.011 | -0.238 | 0.234 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.2.attn.proj.bias + | 0.594 | 0.150 | 0.810 | 0.108 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.weight + | -0.010 | -0.483 | 0.726 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.bias + | -0.006 | -0.567 | 0.499 | 0.125 | torch.Size([360, 180]) || 
stage8.1.residual_group.blocks.2.mlp.fc11.weight + | -0.077 | -0.360 | 0.050 | 0.056 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.536 | 0.673 | 0.119 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc12.weight + | 0.001 | -0.142 | 0.186 | 0.043 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.536 | 0.524 | 0.119 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.2.mlp.fc2.weight + | -0.006 | -0.147 | 0.133 | 0.051 | torch.Size([180]) || stage8.1.residual_group.blocks.2.mlp.fc2.bias + | 0.683 | 0.141 | 0.908 | 0.105 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.weight + | -0.033 | -0.199 | 0.878 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.bias + | -0.039 | -1.527 | 3.891 | 0.199 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.682 | 0.693 | 0.120 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.3.attn.qkv_self.weight + | 0.007 | -0.543 | 0.513 | 0.138 | torch.Size([540]) || stage8.1.residual_group.blocks.3.attn.qkv_self.bias + | -0.001 | -0.390 | 0.476 | 0.089 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.3.attn.proj.weight + | -0.007 | -0.176 | 0.150 | 0.062 | torch.Size([180]) || stage8.1.residual_group.blocks.3.attn.proj.bias + | 0.640 | 0.094 | 0.853 | 0.120 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.weight + | -0.009 | -0.372 | 0.683 | 0.084 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.bias + | -0.006 | -0.628 | 0.521 | 0.126 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc11.weight + | -0.089 | -0.367 | 0.047 | 0.054 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.629 | 0.562 | 0.121 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc12.weight + | -0.001 | -0.186 | 0.128 | 0.042 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.485 | 0.499 | 0.118 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.3.mlp.fc2.weight + | -0.007 | -0.138 | 0.209 | 0.050 | torch.Size([180]) || stage8.1.residual_group.blocks.3.mlp.fc2.bias + | 0.000 | -0.294 | 0.577 | 0.071 | torch.Size([180, 180]) || stage8.1.linear.weight + | 0.004 | -0.349 | 0.235 | 0.072 | torch.Size([180]) || stage8.1.linear.bias + | 0.708 | 0.242 | 1.026 | 0.136 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.weight + | -0.032 | -0.212 | 0.830 | 0.100 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.bias + | -0.039 | -1.954 | 2.394 | 0.212 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.922 | 0.646 | 0.116 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.429 | 0.524 | 0.101 | torch.Size([540]) || stage8.2.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.467 | 0.453 | 0.109 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.0.attn.proj.weight + | -0.005 | -0.339 | 0.264 | 0.095 | torch.Size([180]) || stage8.2.residual_group.blocks.0.attn.proj.bias + | 0.587 | 0.255 | 0.837 | 0.086 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.weight + | 
-0.011 | -0.285 | 0.721 | 0.083 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.bias + | -0.006 | -0.586 | 0.534 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc11.weight + | -0.075 | -0.225 | 0.066 | 0.047 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.493 | 0.532 | 0.123 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc12.weight + | 0.003 | -0.189 | 0.178 | 0.047 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.551 | 0.543 | 0.124 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.0.mlp.fc2.weight + | -0.010 | -0.154 | 0.142 | 0.054 | torch.Size([180]) || stage8.2.residual_group.blocks.0.mlp.fc2.bias + | 0.773 | 0.210 | 1.004 | 0.113 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.weight + | -0.035 | -0.176 | 0.873 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.bias + | -0.027 | -2.407 | 1.736 | 0.214 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.817 | 0.977 | 0.123 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.1.attn.qkv_self.weight + | 0.001 | -0.659 | 0.461 | 0.115 | torch.Size([540]) || stage8.2.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.484 | 0.453 | 0.109 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.1.attn.proj.weight + | -0.014 | -0.315 | 0.252 | 0.091 | torch.Size([180]) || stage8.2.residual_group.blocks.1.attn.proj.bias + | 0.641 | 0.337 | 0.810 | 0.081 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.weight + | -0.011 | -0.177 | 0.806 | 0.083 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.bias + | -0.006 | -0.569 | 0.598 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc11.weight + | -0.079 | -0.323 | 0.071 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.512 | 0.577 | 0.126 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc12.weight + | -0.003 | -0.142 | 0.161 | 0.050 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.529 | 0.572 | 0.125 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.1.mlp.fc2.weight + | -0.010 | -0.178 | 0.159 | 0.066 | torch.Size([180]) || stage8.2.residual_group.blocks.1.mlp.fc2.bias + | 0.857 | 0.199 | 1.153 | 0.112 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.weight + | -0.039 | -0.189 | 0.943 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.bias + | -0.042 | -1.962 | 2.773 | 0.246 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.783 | 0.655 | 0.123 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.2.attn.qkv_self.weight + | 0.004 | -0.338 | 0.533 | 0.099 | torch.Size([540]) || stage8.2.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.497 | 0.461 | 0.107 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.2.attn.proj.weight + | -0.008 | -0.288 | 0.183 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.2.attn.proj.bias + | 0.681 | 0.327 | 0.878 | 0.085 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.weight + | 
-0.012 | -0.178 | 0.773 | 0.084 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.bias + | -0.006 | -0.789 | 0.546 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc11.weight + | -0.081 | -0.249 | 0.036 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.526 | 0.555 | 0.128 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc12.weight + | 0.000 | -0.133 | 0.191 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.572 | 0.529 | 0.126 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.164 | 0.147 | 0.065 | torch.Size([180]) || stage8.2.residual_group.blocks.2.mlp.fc2.bias + | 0.877 | 0.198 | 1.043 | 0.094 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.weight + | -0.038 | -0.210 | 0.916 | 0.091 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.bias + | -0.094 | -2.974 | 4.987 | 0.299 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.964 | 1.011 | 0.126 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.3.attn.qkv_self.weight + | -0.002 | -0.404 | 0.429 | 0.101 | torch.Size([540]) || stage8.2.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.501 | 0.489 | 0.110 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.3.attn.proj.weight + | -0.021 | -0.305 | 0.208 | 0.097 | torch.Size([180]) || stage8.2.residual_group.blocks.3.attn.proj.bias + | 0.697 | 0.295 | 0.894 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.weight + | -0.015 | -0.241 | 0.712 | 0.086 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.bias + | -0.005 | -0.562 | 0.573 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc11.weight + | -0.085 | -0.302 | 0.080 | 0.060 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.734 | 0.573 | 0.130 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc12.weight + | 0.001 | -0.150 | 0.161 | 0.054 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.671 | 0.623 | 0.127 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.3.mlp.fc2.weight + | -0.023 | -0.252 | 0.317 | 0.081 | torch.Size([180]) || stage8.2.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.278 | 0.345 | 0.064 | torch.Size([180, 180]) || stage8.2.linear.weight + | 0.004 | -0.315 | 0.148 | 0.064 | torch.Size([180]) || stage8.2.linear.bias + | 0.850 | 0.326 | 1.087 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.weight + | -0.031 | -0.334 | 0.779 | 0.106 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.bias + | -0.012 | -2.917 | 1.476 | 0.175 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.603 | 0.666 | 0.124 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.374 | 0.381 | 0.086 | torch.Size([540]) || stage8.3.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.577 | 0.605 | 0.119 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.0.attn.proj.weight + | -0.008 | -0.394 | 0.499 | 0.134 | 
torch.Size([180]) || stage8.3.residual_group.blocks.0.attn.proj.bias + | 0.636 | 0.321 | 0.790 | 0.073 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.weight + | -0.013 | -0.294 | 0.774 | 0.090 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.bias + | -0.004 | -0.540 | 0.539 | 0.123 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc11.weight + | -0.065 | -0.212 | 0.047 | 0.051 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.608 | 0.603 | 0.130 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc12.weight + | -0.002 | -0.177 | 0.155 | 0.051 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.573 | 0.630 | 0.129 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.0.mlp.fc2.weight + | -0.005 | -0.189 | 0.178 | 0.071 | torch.Size([180]) || stage8.3.residual_group.blocks.0.mlp.fc2.bias + | 0.899 | 0.275 | 1.048 | 0.099 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.weight + | -0.031 | -0.223 | 0.771 | 0.088 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.bias + | -0.003 | -3.151 | 1.718 | 0.202 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.1.attn.relative_position_index + | -0.000 | -0.732 | 0.868 | 0.127 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.1.attn.qkv_self.weight + | 0.002 | -0.412 | 0.350 | 0.093 | torch.Size([540]) || stage8.3.residual_group.blocks.1.attn.qkv_self.bias + | 0.001 | -0.466 | 0.487 | 0.114 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.1.attn.proj.weight + | -0.006 | -0.388 | 0.400 | 0.129 | torch.Size([180]) || stage8.3.residual_group.blocks.1.attn.proj.bias + | 0.711 | 0.381 | 0.864 | 0.082 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.weight + | -0.009 | -0.240 | 0.692 | 0.090 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.bias + | -0.005 | -0.657 | 0.639 | 0.126 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc11.weight + | -0.077 | -0.263 | 0.047 | 0.057 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.673 | 0.605 | 0.134 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc12.weight + | 0.002 | -0.158 | 0.155 | 0.046 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc12.bias + | -0.000 | -0.582 | 0.585 | 0.131 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.1.mlp.fc2.weight + | -0.009 | -0.253 | 0.178 | 0.070 | torch.Size([180]) || stage8.3.residual_group.blocks.1.mlp.fc2.bias + | 0.941 | 0.262 | 1.154 | 0.094 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.weight + | -0.032 | -0.162 | 0.906 | 0.084 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.bias + | -0.005 | -3.421 | 1.350 | 0.205 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.777 | 0.735 | 0.130 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.2.attn.qkv_self.weight + | 0.000 | -0.355 | 0.421 | 0.092 | torch.Size([540]) || stage8.3.residual_group.blocks.2.attn.qkv_self.bias + | 0.000 | -0.479 | 0.475 | 0.115 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.2.attn.proj.weight + | -0.013 | -0.292 | 0.345 | 0.122 | 
torch.Size([180]) || stage8.3.residual_group.blocks.2.attn.proj.bias + | 0.743 | 0.242 | 0.919 | 0.093 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.weight + | -0.011 | -0.214 | 0.691 | 0.094 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.bias + | -0.005 | -0.633 | 0.498 | 0.127 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc11.weight + | -0.082 | -0.346 | 0.087 | 0.062 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.591 | 0.670 | 0.134 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc12.weight + | 0.001 | -0.190 | 0.151 | 0.056 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.560 | 0.637 | 0.132 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.2.mlp.fc2.weight + | -0.009 | -0.226 | 0.250 | 0.085 | torch.Size([180]) || stage8.3.residual_group.blocks.2.mlp.fc2.bias + | 0.950 | 0.250 | 1.103 | 0.086 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.weight + | -0.035 | -0.196 | 0.925 | 0.088 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.bias + | -0.026 | -3.591 | 5.653 | 0.236 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.753 | 0.637 | 0.128 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.3.attn.qkv_self.weight + | 0.000 | -0.333 | 0.432 | 0.081 | torch.Size([540]) || stage8.3.residual_group.blocks.3.attn.qkv_self.bias + | 0.001 | -0.591 | 0.591 | 0.118 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.3.attn.proj.weight + | -0.014 | -0.348 | 0.267 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.3.attn.proj.bias + | 0.735 | 0.254 | 0.893 | 0.082 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.weight + | -0.011 | -0.241 | 0.659 | 0.093 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.bias + | -0.005 | -0.628 | 0.667 | 0.125 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc11.weight + | -0.076 | -0.411 | 0.113 | 0.072 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.662 | 0.578 | 0.135 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc12.weight + | -0.004 | -0.208 | 0.169 | 0.054 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.602 | 0.588 | 0.131 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.3.mlp.fc2.weight + | -0.011 | -0.218 | 0.232 | 0.096 | torch.Size([180]) || stage8.3.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.343 | 0.316 | 0.065 | torch.Size([180, 180]) || stage8.3.linear.weight + | 0.010 | -0.297 | 0.187 | 0.061 | torch.Size([180]) || stage8.3.linear.bias + | 1.012 | 0.330 | 1.282 | 0.149 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.weight + | -0.030 | -0.347 | 0.800 | 0.134 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.bias + | -0.013 | -2.816 | 3.792 | 0.236 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.807 | 0.825 | 0.131 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.0.attn.qkv_self.weight + | -0.003 | -0.429 | 0.319 | 0.083 | torch.Size([540]) || 
stage8.4.residual_group.blocks.0.attn.qkv_self.bias + | 0.001 | -0.553 | 0.569 | 0.136 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.0.attn.proj.weight + | -0.019 | -0.443 | 0.441 | 0.139 | torch.Size([180]) || stage8.4.residual_group.blocks.0.attn.proj.bias + | 0.638 | 0.420 | 0.797 | 0.063 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.weight + | -0.018 | -0.222 | 0.886 | 0.107 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.bias + | -0.002 | -0.576 | 0.510 | 0.117 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc11.weight + | -0.018 | -0.277 | 0.123 | 0.068 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.687 | 0.625 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc12.weight + | -0.007 | -0.264 | 0.267 | 0.076 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc12.bias + | 0.001 | -0.639 | 0.705 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.0.mlp.fc2.weight + | -0.012 | -0.255 | 0.274 | 0.095 | torch.Size([180]) || stage8.4.residual_group.blocks.0.mlp.fc2.bias + | 1.092 | 0.475 | 1.341 | 0.115 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.weight + | -0.030 | -0.294 | 0.686 | 0.113 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.bias + | 0.018 | -3.165 | 0.990 | 0.213 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.695 | 0.699 | 0.133 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.1.attn.qkv_self.weight + | 0.002 | -0.319 | 0.286 | 0.075 | torch.Size([540]) || stage8.4.residual_group.blocks.1.attn.qkv_self.bias + | -0.001 | -0.542 | 0.519 | 0.133 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.1.attn.proj.weight + | -0.017 | -0.439 | 0.451 | 0.152 | torch.Size([180]) || stage8.4.residual_group.blocks.1.attn.proj.bias + | 0.664 | 0.366 | 0.835 | 0.074 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.weight + | -0.015 | -0.217 | 0.985 | 0.103 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.bias + | -0.002 | -0.641 | 0.563 | 0.117 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc11.weight + | -0.022 | -0.381 | 0.161 | 0.078 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.571 | 0.642 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc12.weight + | 0.003 | -0.279 | 0.311 | 0.087 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.738 | 0.633 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.1.mlp.fc2.weight + | -0.007 | -0.254 | 0.261 | 0.084 | torch.Size([180]) || stage8.4.residual_group.blocks.1.mlp.fc2.bias + | 1.125 | 0.525 | 1.405 | 0.117 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.weight + | -0.033 | -0.186 | 0.627 | 0.082 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.bias + | 0.028 | -3.477 | 0.957 | 0.217 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.663 | 0.658 | 0.130 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.2.attn.qkv_self.weight + | -0.007 | -0.357 | 0.255 | 0.064 | torch.Size([540]) || 
stage8.4.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.596 | 0.578 | 0.137 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.2.attn.proj.weight + | -0.018 | -0.506 | 0.389 | 0.159 | torch.Size([180]) || stage8.4.residual_group.blocks.2.attn.proj.bias + | 0.694 | 0.319 | 0.865 | 0.084 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.weight + | -0.018 | -0.150 | 0.975 | 0.087 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.bias + | -0.002 | -0.619 | 0.565 | 0.116 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc11.weight + | -0.025 | -0.345 | 0.208 | 0.086 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.624 | 0.607 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc12.weight + | -0.003 | -0.388 | 0.290 | 0.075 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.927 | 0.675 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.325 | 0.240 | 0.096 | torch.Size([180]) || stage8.4.residual_group.blocks.2.mlp.fc2.bias + | 1.108 | 0.535 | 1.297 | 0.094 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.weight + | -0.035 | -0.213 | 0.546 | 0.064 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.bias + | 0.020 | -3.042 | 1.420 | 0.192 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.697 | 0.700 | 0.128 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.3.attn.qkv_self.weight + | -0.000 | -0.220 | 0.311 | 0.065 | torch.Size([540]) || stage8.4.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.652 | 0.592 | 0.138 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.3.attn.proj.weight + | -0.019 | -0.535 | 0.426 | 0.154 | torch.Size([180]) || stage8.4.residual_group.blocks.3.attn.proj.bias + | 0.685 | 0.225 | 0.893 | 0.082 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.weight + | -0.023 | -0.211 | 0.938 | 0.093 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.bias + | -0.001 | -0.501 | 0.564 | 0.113 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc11.weight + | -0.014 | -0.339 | 0.237 | 0.092 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.560 | 0.626 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc12.weight + | 0.000 | -0.231 | 0.239 | 0.075 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.544 | 0.657 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.3.mlp.fc2.weight + | -0.007 | -0.271 | 0.274 | 0.093 | torch.Size([180]) || stage8.4.residual_group.blocks.3.mlp.fc2.bias + | -0.001 | -0.473 | 0.481 | 0.069 | torch.Size([180, 180]) || stage8.4.linear.weight + | 0.029 | -0.333 | 0.194 | 0.076 | torch.Size([180]) || stage8.4.linear.bias + | 1.025 | 0.297 | 1.336 | 0.162 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.weight + | -0.034 | -0.429 | 0.872 | 0.141 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.bias + | -0.574 | -4.515 | 3.381 | 0.800 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.0.attn.relative_position_index + | 0.000 | 
-0.771 | 0.886 | 0.125 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.0.attn.qkv_self.weight
+ | 0.000 | -0.356 | 0.521 | 0.085 | torch.Size([540]) || stage8.5.residual_group.blocks.0.attn.qkv_self.bias
+ | -0.001 | -0.632 | 0.656 | 0.147 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.0.attn.proj.weight
+ | -0.029 | -0.329 | 0.697 | 0.127 | torch.Size([180]) || stage8.5.residual_group.blocks.0.attn.proj.bias
+ | 0.777 | 0.446 | 0.952 | 0.069 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.weight
+ | -0.022 | -0.335 | 0.920 | 0.121 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.bias
+ | -0.002 | -0.520 | 0.598 | 0.117 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc11.weight
+ | -0.013 | -0.456 | 0.200 | 0.075 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.677 | 0.642 | 0.137 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc12.weight
+ | 0.005 | -0.272 | 0.233 | 0.083 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.762 | 0.598 | 0.136 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.0.mlp.fc2.weight
+ | -0.025 | -0.244 | 0.583 | 0.111 | torch.Size([180]) || stage8.5.residual_group.blocks.0.mlp.fc2.bias
+ | 1.021 | 0.261 | 1.261 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.weight
+ | -0.033 | -0.358 | 0.867 | 0.120 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.bias
+ | -0.550 | -3.274 | 4.406 | 0.670 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.1.attn.relative_position_bias_table
+ | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.1.attn.relative_position_index
+ | 0.000 | -0.819 | 0.986 | 0.122 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.1.attn.qkv_self.weight
+ | 0.005 | -0.510 | 0.446 | 0.084 | torch.Size([540]) || stage8.5.residual_group.blocks.1.attn.qkv_self.bias
+ | -0.003 | -0.739 | 0.682 | 0.151 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.1.attn.proj.weight
+ | -0.032 | -0.318 | 0.607 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.attn.proj.bias
+ | 0.823 | 0.420 | 0.950 | 0.070 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.weight
+ | -0.021 | -0.274 | 0.882 | 0.111 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.bias
+ | -0.002 | -0.496 | 0.532 | 0.117 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc11.weight
+ | -0.028 | -0.260 | 0.194 | 0.080 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.620 | 0.586 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc12.weight
+ | 0.004 | -0.284 | 0.423 | 0.083 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.774 | 0.614 | 0.137 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.1.mlp.fc2.weight
+ | -0.028 | -0.371 | 0.561 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.mlp.fc2.bias
+ | 1.096 | 0.377 | 1.321 | 0.110 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.weight
+ | -0.033 | -0.244 | 0.755 | 0.100 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.bias
+ | -0.441 | -3.439 | 5.870 | 0.668 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.2.attn.relative_position_bias_table
+ | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.2.attn.relative_position_index
+ | -0.000 | -0.710 | 0.679 | 0.123 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.2.attn.qkv_self.weight
+ | 0.003 | -0.277 | 0.283 | 0.068 | torch.Size([540]) || stage8.5.residual_group.blocks.2.attn.qkv_self.bias
+ | 0.001 | -0.824 | 0.684 | 0.150 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.2.attn.proj.weight
+ | -0.033 | -0.390 | 0.545 | 0.155 | torch.Size([180]) || stage8.5.residual_group.blocks.2.attn.proj.bias
+ | 0.843 | 0.390 | 0.984 | 0.076 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.weight
+ | -0.022 | -0.211 | 0.854 | 0.090 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.bias
+ | -0.002 | -0.522 | 0.503 | 0.116 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc11.weight
+ | -0.024 | -0.243 | 0.219 | 0.091 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc11.bias
+ | -0.001 | -0.638 | 0.617 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc12.weight
+ | -0.004 | -0.268 | 0.380 | 0.078 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc12.bias
+ | 0.000 | -0.713 | 0.769 | 0.138 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.2.mlp.fc2.weight
+ | -0.034 | -0.372 | 0.592 | 0.151 | torch.Size([180]) || stage8.5.residual_group.blocks.2.mlp.fc2.bias
+ | 1.027 | 0.318 | 1.206 | 0.094 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.weight
+ | -0.033 | -0.187 | 0.768 | 0.088 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.bias
+ | -0.347 | -2.664 | 2.684 | 0.528 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.3.attn.relative_position_bias_table
+ | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.3.attn.relative_position_index
+ | 0.000 | -0.677 | 0.676 | 0.127 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.3.attn.qkv_self.weight
+ | 0.002 | -0.410 | 0.354 | 0.080 | torch.Size([540]) || stage8.5.residual_group.blocks.3.attn.qkv_self.bias
+ | 0.000 | -0.630 | 0.725 | 0.145 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.3.attn.proj.weight
+ | -0.041 | -0.385 | 0.660 | 0.163 | torch.Size([180]) || stage8.5.residual_group.blocks.3.attn.proj.bias
+ | 0.849 | 0.390 | 0.985 | 0.070 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.weight
+ | -0.023 | -0.163 | 0.810 | 0.084 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.bias
+ | -0.002 | -0.547 | 0.536 | 0.115 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc11.weight
+ | -0.012 | -0.366 | 0.252 | 0.106 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc11.bias
+ | -0.000 | -0.669 | 0.597 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc12.weight
+ | -0.002 | -0.216 | 0.202 | 0.074 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.700 | 0.674 | 0.139 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.3.mlp.fc2.weight
+ | -0.032 | -0.376 | 0.666 | 0.134 | torch.Size([180]) || stage8.5.residual_group.blocks.3.mlp.fc2.bias
+ | -0.001 | -0.299 | 0.469 | 0.069 | torch.Size([180, 180]) || stage8.5.linear.weight
+ | 0.081 | -0.562 | 0.263 | 0.109 | torch.Size([180]) || stage8.5.linear.bias
+ | 1.111 | 0.208 | 1.434 | 0.192 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.weight
+ | -0.048 | -0.547 | 0.851 | 0.175 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.bias
+ | -0.252 | -2.157 | 6.293 | 0.490 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.0.attn.relative_position_bias_table
+ | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.0.attn.relative_position_index
+ | 0.000 | -0.664 | 0.631 | 0.123 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.0.attn.qkv_self.weight
+ | 0.007 | -0.293 | 0.366 | 0.078 | torch.Size([540]) || stage8.6.residual_group.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.701 | 0.726 | 0.154 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.0.attn.proj.weight
+ | 0.030 | -0.318 | 0.331 | 0.109 | torch.Size([180]) || stage8.6.residual_group.blocks.0.attn.proj.bias
+ | 0.959 | 0.475 | 1.322 | 0.088 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.weight
+ | -0.039 | -0.421 | 0.873 | 0.151 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.bias
+ | -0.002 | -0.550 | 0.783 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc11.weight
+ | 0.002 | -0.269 | 0.152 | 0.069 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.914 | 0.839 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc12.weight
+ | 0.001 | -0.340 | 0.304 | 0.075 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.592 | 0.713 | 0.140 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.0.mlp.fc2.weight
+ | 0.002 | -0.535 | 0.384 | 0.177 | torch.Size([180]) || stage8.6.residual_group.blocks.0.mlp.fc2.bias
+ | 1.123 | 0.183 | 1.352 | 0.165 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.weight
+ | -0.047 | -0.513 | 0.903 | 0.168 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.bias
+ | -0.234 | -1.968 | 6.366 | 0.448 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.1.attn.relative_position_bias_table
+ | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.1.attn.relative_position_index
+ | 0.000 | -0.751 | 0.759 | 0.121 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.1.attn.qkv_self.weight
+ | -0.001 | -0.300 | 0.214 | 0.061 | torch.Size([540]) || stage8.6.residual_group.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.657 | 0.699 | 0.148 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.1.attn.proj.weight
+ | 0.031 | -0.321 | 0.293 | 0.115 | torch.Size([180]) || stage8.6.residual_group.blocks.1.attn.proj.bias
+ | 0.986 | 0.416 | 1.360 | 0.096 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.weight
+ | -0.038 | -0.393 | 0.807 | 0.146 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.bias
+ | -0.001 | -0.589 | 0.620 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc11.weight
+ | 0.005 | -0.316 | 0.229 | 0.071 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.738 | 0.766 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc12.weight
+ | 0.001 | -0.252 | 0.302 | 0.072 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.674 | 0.629 | 0.140 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.1.mlp.fc2.weight
+ | -0.001 | -0.475 | 0.441 | 0.175 | torch.Size([180]) || stage8.6.residual_group.blocks.1.mlp.fc2.bias
+ | 1.097 | 0.342 | 1.294 | 0.134 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.weight
+ | -0.054 | -0.639 | 0.904 | 0.186 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.bias
+ | -0.135 | -3.252 | 1.238 | 0.360 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.2.attn.relative_position_bias_table
+ | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.2.attn.relative_position_index
+ | 0.000 | -0.672 | 0.663 | 0.128 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.2.attn.qkv_self.weight
+ | 0.007 | -0.170 | 0.228 | 0.046 | torch.Size([540]) || stage8.6.residual_group.blocks.2.attn.qkv_self.bias
+ | -0.001 | -0.660 | 0.651 | 0.147 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.2.attn.proj.weight
+ | 0.031 | -0.360 | 0.322 | 0.126 | torch.Size([180]) || stage8.6.residual_group.blocks.2.attn.proj.bias
+ | 1.004 | 0.360 | 1.381 | 0.099 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.weight
+ | -0.042 | -0.447 | 0.808 | 0.157 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.bias
+ | -0.000 | -0.600 | 0.603 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc11.weight
+ | 0.022 | -0.447 | 0.249 | 0.086 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc11.bias
+ | 0.000 | -0.666 | 0.708 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc12.weight
+ | -0.002 | -0.326 | 0.272 | 0.075 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc12.bias
+ | -0.001 | -0.653 | 0.719 | 0.142 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.2.mlp.fc2.weight
+ | -0.011 | -0.488 | 0.321 | 0.153 | torch.Size([180]) || stage8.6.residual_group.blocks.2.mlp.fc2.bias
+ | 1.095 | 0.272 | 1.302 | 0.123 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.weight
+ | -0.052 | -0.557 | 1.069 | 0.192 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.bias
+ | -0.196 | -2.349 | 1.401 | 0.360 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.3.attn.relative_position_bias_table
+ | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.3.attn.relative_position_index
+ | 0.000 | -0.741 | 0.657 | 0.124 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.3.attn.qkv_self.weight
+ | 0.001 | -0.186 | 0.141 | 0.040 | torch.Size([540]) || stage8.6.residual_group.blocks.3.attn.qkv_self.bias
+ | -0.001 | -0.669 | 0.671 | 0.139 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.3.attn.proj.weight
+ | -0.004 | -0.323 | 0.300 | 0.124 | torch.Size([180]) || stage8.6.residual_group.blocks.3.attn.proj.bias
+ | 0.999 | 0.383 | 1.380 | 0.103 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.weight
+ | -0.044 | -0.392 | 0.694 | 0.163 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.bias
+ | 0.000 | -0.577 | 0.857 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc11.weight
+ | 0.041 | -0.394 | 0.238 | 0.087 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc11.bias
+ | 0.000 | -0.924 | 0.828 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc12.weight
+ | -0.003 | -0.214 | 0.407 | 0.071 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.827 | 0.755 | 0.141 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.3.mlp.fc2.weight
+ | 0.022 | -0.296 | 0.262 | 0.107 | torch.Size([180]) || stage8.6.residual_group.blocks.3.mlp.fc2.bias
+ | 0.002 | -1.059 | 1.262 | 0.089 | torch.Size([180, 180]) || stage8.6.linear.weight
+ | 0.031 | -0.789 | 0.427 | 0.120 | torch.Size([180]) || stage8.6.linear.bias
+ | 0.389 | 0.079 | 1.137 | 0.176 | torch.Size([180]) || norm.weight
+ | -0.021 | -0.669 | 0.888 | 0.127 | torch.Size([180]) || norm.bias
+ | 0.000 | -0.486 | 0.568 | 0.103 | torch.Size([120, 180]) || conv_after_body.weight
+ | -0.000 | -0.167 | 0.168 | 0.055 | torch.Size([120]) || conv_after_body.bias
+ | -0.000 | -1.782 | 1.300 | 0.109 | torch.Size([64, 120, 1, 3, 3]) || conv_before_upsample.0.weight
+ | -0.019 | -0.542 | 0.437 | 0.162 | torch.Size([64]) || conv_before_upsample.0.bias
+ | 0.001 | -1.915 | 1.372 | 0.090 | torch.Size([256, 64, 1, 3, 3]) || upsample.0.weight
+ | -0.045 | -0.281 | 0.215 | 0.097 | torch.Size([256]) || upsample.0.bias
+ | -0.006 | -4.826 | 0.582 | 0.075 | torch.Size([256, 64, 1, 3, 3]) || upsample.5.weight
+ | -0.154 | -0.441 | 0.187 | 0.100 | torch.Size([256]) || upsample.5.bias
+ | 0.000 | -0.210 | 0.246 | 0.012 | torch.Size([64, 64, 1, 3, 3]) || upsample.10.weight
+ | 0.000 | -0.013 | 0.007 | 0.003 | torch.Size([64]) || upsample.10.bias
+ | 0.000 | -0.044 | 0.042 | 0.004 | torch.Size([3, 64, 1, 3, 3]) || conv_last.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([3]) || conv_last.bias
+
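[Editor's note: the dump above lists, for every tensor in the network's state dict, its mean | min | max | std | shape || name. A minimal sketch that reproduces this layout for any PyTorch module is shown below; the function name and formatting are illustrative, not necessarily KAIR's exact logging code.]

```python
import torch

def describe_state_dict(net: torch.nn.Module) -> str:
    """Format per-tensor statistics in the same
    'mean | min | max | std | shape || name' layout as the dump above.
    Iterating over state_dict() also covers registered buffers such as
    relative_position_index, which appear in the log alongside weights."""
    lines = []
    for name, tensor in net.state_dict().items():
        v = tensor.float()  # buffers may be integer-typed; cast for stats
        lines.append(' | {:7.3f} | {:7.3f} | {:7.3f} | {:7.3f} | {} || {}'.format(
            v.mean().item(), v.min().item(), v.max().item(), v.std().item(),
            v.shape, name))
    return '\n'.join(lines)

# Usage (hypothetical): print(describe_state_dict(model)) for any nn.Module.
```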
+22-03-11 10:53:40.924 : task: 001_train_vrt_videosr_bi_reds_6frames
+  model: vrt
+  gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7]
+  dist: False
+  find_unused_parameters: False
+  use_static_graph: True
+  scale: 4
+  n_channels: 3
+  path:[
+    root: experiments
+    pretrained_netG: /home/cll/dev/KAIR/model_zoo/vrt/001_VRT_videosr_bi_REDS_6frames.pth
+    pretrained_netE: None
+    task: experiments/001_train_vrt_videosr_bi_reds_6frames
+    log: experiments/001_train_vrt_videosr_bi_reds_6frames
+    options: experiments/001_train_vrt_videosr_bi_reds_6frames/options
+    models: experiments/001_train_vrt_videosr_bi_reds_6frames/models
+    images: experiments/001_train_vrt_videosr_bi_reds_6frames/images
+    pretrained_optimizerG: None
+  ]
+  datasets:[
+    train:[
+      name: train_dataset
+      dataset_type: VideoRecurrentTrainDataset
+      dataroot_gt: /home/cll/datasets/REDS/train/train_sharp
+      dataroot_lq: /home/cll/datasets/REDS/train/train_sharp_bicubic/X4
+      meta_info_file: data/meta_info/meta_info_REDS_GT.txt
+      filename_tmpl: 08d
+      filename_ext: png
+      val_partition: REDS4
+      test_mode: False
+      io_backend:[
+        type: disk
+      ]
+      num_frame: 4
+      gt_size: 256
+      interval_list: [1]
+      random_reverse: False
+      use_hflip: True
+      use_rot: True
+      dataloader_shuffle: True
+      dataloader_num_workers: 32
+      dataloader_batch_size: 8
+      phase: train
+      scale: 4
+      n_channels: 3
+    ]
+    test:[
+      name: test_dataset
+      dataset_type: VideoRecurrentTestDataset
+      dataroot_gt: /home/cll/Desktop/REDS4/GT
+      dataroot_lq: /home/cll/Desktop/REDS4/sharp_bicubic
+      cache_data: True
+      io_backend:[
+        type: disk
+      ]
+      num_frame: -1
+      phase: test
+      scale: 4
+      n_channels: 3
+    ]
+  ]
+  netG:[
+    net_type: vrt
+    upscale: 4
+    img_size: [6, 64, 64]
+    window_size: [2, 8, 8]
+    depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4]
+    indep_reconsts: [11, 12]
+    embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180]
+    num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
+    spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth
+    pa_frames: 2
+    deformable_groups: 12
+    nonblind_denoising: False
+    use_checkpoint_attn: False
+    use_checkpoint_ffn: False
+    no_checkpoint_attn_blocks: []
+    no_checkpoint_ffn_blocks: []
+    init_type: default
+    scale: 4
+  ]
+  train:[
+    G_lossfn_type: charbonnier
+    G_lossfn_weight: 1.0
+    G_charbonnier_eps: 1e-09
+    E_decay: 0
+    G_optimizer_type: adam
+    G_optimizer_lr: 0.0004
+    G_optimizer_betas: [0.9, 0.99]
+    G_optimizer_wd: 0
+    G_optimizer_clipgrad: None
+    G_optimizer_reuse: True
+    fix_iter: 20000
+    fix_lr_mul: 0.125
+    fix_keys: ['spynet', 'deform']
+    total_iter: 300000
+    G_scheduler_type: CosineAnnealingWarmRestarts
+    G_scheduler_periods: 300000
+    G_scheduler_eta_min: 1e-07
+    G_regularizer_orthstep: None
+    G_regularizer_clipstep: None
+    G_param_strict: True
+    E_param_strict: True
+    checkpoint_test: 5000
+    checkpoint_save: 5000
+    checkpoint_print: 200
+    F_feature_layer: 34
+    F_weights: 1.0
+    F_lossfn_type: l1
+    F_use_input_norm: True
+    F_use_range_norm: False
+    G_scheduler_restart_weights: 1
+  ]
+  val:[
+    save_img: False
+    pad_seq: False
+    flip_seq: False
+    center_frame_only: False
+    num_frame_testing: 40
+    num_frame_overlapping: 2
+    size_patch_testing: 128
+  ]
+  opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json
+  is_train: True
+  merge_bn: False
+  merge_bn_startpoint: -1
+  num_gpu: 8
+  rank: 0
+  world_size: 1
+
+22-03-11 10:53:40.969 : Number of train images: 24,000, iters: 3,000
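[Editor's note: the iteration counts in the "Number of train images" lines are simply images divided by dataloader_batch_size: 27,000 / 8 = 3,375 for the first run and 24,000 / 8 = 3,000 for this one. Both runs optimize the Charbonnier loss named in the train block above (G_lossfn_type: charbonnier, G_charbonnier_eps: 1e-09), a smooth variant of L1 that stays differentiable at zero. A minimal sketch under the standard formulation follows; KAIR's own implementation may differ in detail.]

```python
import torch

class CharbonnierLoss(torch.nn.Module):
    """Charbonnier loss: mean(sqrt((x - y)^2 + eps)).

    eps matches G_charbonnier_eps in the config above; it keeps the
    gradient finite where x == y. Illustrative sketch, not necessarily
    KAIR's exact code."""

    def __init__(self, eps: float = 1e-9):
        super().__init__()
        self.eps = eps

    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        diff = x - y
        return torch.mean(torch.sqrt(diff * diff + self.eps))
```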