diff --git "a/KAIR/experiments/001_train_vrt_videosr_bi_reds_6frames/train.log" "b/KAIR/experiments/001_train_vrt_videosr_bi_reds_6frames/train.log"
new file mode 100644
--- /dev/null
+++ "b/KAIR/experiments/001_train_vrt_videosr_bi_reds_6frames/train.log"
@@ -0,0 +1,22331 @@
+22-03-11 09:54:38.123 : task: 001_train_vrt_videosr_bi_reds_6frames
+  model: vrt
+  gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7]
+  dist: False
+  find_unused_parameters: False
+  use_static_graph: True
+  scale: 4
+  n_channels: 3
+  path:[
+    root: experiments
+    pretrained_netG: None
+    pretrained_netE: None
+    task: experiments/001_train_vrt_videosr_bi_reds_6frames
+    log: experiments/001_train_vrt_videosr_bi_reds_6frames
+    options: experiments/001_train_vrt_videosr_bi_reds_6frames/options
+    models: experiments/001_train_vrt_videosr_bi_reds_6frames/models
+    images: experiments/001_train_vrt_videosr_bi_reds_6frames/images
+    pretrained_optimizerG: None
+  ]
+  datasets:[
+    train:[
+      name: train_dataset
+      dataset_type: VideoRecurrentTrainDataset
+      dataroot_gt: trainsets/REDS/train_sharp_with_val.lmdb
+      dataroot_lq: trainsets/REDS/train_sharp_bicubic_with_val.lmdb
+      meta_info_file: data/meta_info/meta_info_REDS_GT.txt
+      filename_tmpl: 08d
+      filename_ext: png
+      val_partition: REDS4
+      test_mode: False
+      io_backend:[
+        type: lmdb
+      ]
+      num_frame: 6
+      gt_size: 256
+      interval_list: [1]
+      random_reverse: False
+      use_hflip: True
+      use_rot: True
+      dataloader_shuffle: True
+      dataloader_num_workers: 32
+      dataloader_batch_size: 8
+      phase: train
+      scale: 4
+      n_channels: 3
+    ]
+    test:[
+      name: test_dataset
+      dataset_type: VideoRecurrentTestDataset
+      dataroot_gt: testsets/REDS4/GT
+      dataroot_lq: testsets/REDS4/sharp_bicubic
+      cache_data: True
+      io_backend:[
+        type: disk
+      ]
+      num_frame: -1
+      phase: test
+      scale: 4
+      n_channels: 3
+    ]
+  ]
+  netG:[
+    net_type: vrt
+    upscale: 4
+    img_size: [6, 64, 64]
+    window_size: [6, 8, 8]
+    depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4]
+    indep_reconsts: [11, 12]
+    embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180]
+    num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
+    spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth
+    pa_frames: 2
+    deformable_groups: 12
+    nonblind_denoising: False
+    use_checkpoint_attn: False
+    use_checkpoint_ffn: False
+    no_checkpoint_attn_blocks: []
+    no_checkpoint_ffn_blocks: []
+    init_type: default
+    scale: 4
+  ]
+  train:[
+    G_lossfn_type: charbonnier
+    G_lossfn_weight: 1.0
+    G_charbonnier_eps: 1e-09
+    E_decay: 0
+    G_optimizer_type: adam
+    G_optimizer_lr: 0.0004
+    G_optimizer_betas: [0.9, 0.99]
+    G_optimizer_wd: 0
+    G_optimizer_clipgrad: None
+    G_optimizer_reuse: True
+    fix_iter: 20000
+    fix_lr_mul: 0.125
+    fix_keys: ['spynet', 'deform']
+    total_iter: 300000
+    G_scheduler_type: CosineAnnealingWarmRestarts
+    G_scheduler_periods: 300000
+    G_scheduler_eta_min: 1e-07
+    G_regularizer_orthstep: None
+    G_regularizer_clipstep: None
+    G_param_strict: True
+    E_param_strict: True
+    checkpoint_test: 5000
+    checkpoint_save: 5000
+    checkpoint_print: 200
+    F_feature_layer: 34
+    F_weights: 1.0
+    F_lossfn_type: l1
+    F_use_input_norm: True
+    F_use_range_norm: False
+    G_scheduler_restart_weights: 1
+  ]
+  val:[
+    save_img: False
+    pad_seq: False
+    flip_seq: False
+    center_frame_only: False
+    num_frame_testing: 40
+    num_frame_overlapping: 2
+    size_patch_testing: 128
+  ]
+  opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json
+  is_train: True
+  merge_bn: False
+  merge_bn_startpoint: -1
+  num_gpu: 8
+  rank: 0
+  world_size: 1
+
+22-03-11 09:54:38.147 : Number of train images: 27,000, iters: 3,375
+22-03-11 09:54:50.175 : task: 001_train_vrt_videosr_bi_reds_6frames
+  model: vrt
+  gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7]
+  dist: False
+  find_unused_parameters: False
+  use_static_graph: True
+  scale: 4
+  n_channels: 3
+  path:[
+    root: experiments
+    pretrained_netG: None
+    pretrained_netE: None
+    task: experiments/001_train_vrt_videosr_bi_reds_6frames
+    log: experiments/001_train_vrt_videosr_bi_reds_6frames
+    options: experiments/001_train_vrt_videosr_bi_reds_6frames/options
+    models: experiments/001_train_vrt_videosr_bi_reds_6frames/models
+    images: experiments/001_train_vrt_videosr_bi_reds_6frames/images
+    pretrained_optimizerG: None
+  ]
+  datasets:[
+    train:[
+      name: train_dataset
+      dataset_type: VideoRecurrentTrainDataset
+      dataroot_gt: trainsets/REDS/train_sharp_with_val.lmdb
+      dataroot_lq: trainsets/REDS/train_sharp_bicubic_with_val.lmdb
+      meta_info_file: data/meta_info/meta_info_REDS_GT.txt
+      filename_tmpl: 08d
+      filename_ext: png
+      val_partition: REDS4
+      test_mode: False
+      io_backend:[
+        type: lmdb
+      ]
+      num_frame: 6
+      gt_size: 256
+      interval_list: [1]
+      random_reverse: False
+      use_hflip: True
+      use_rot: True
+      dataloader_shuffle: True
+      dataloader_num_workers: 32
+      dataloader_batch_size: 8
+      phase: train
+      scale: 4
+      n_channels: 3
+    ]
+    test:[
+      name: test_dataset
+      dataset_type: VideoRecurrentTestDataset
+      dataroot_gt: testsets/REDS4/GT
+      dataroot_lq: testsets/REDS4/sharp_bicubic
+      cache_data: True
+      io_backend:[
+        type: disk
+      ]
+      num_frame: -1
+      phase: test
+      scale: 4
+      n_channels: 3
+    ]
+  ]
+  netG:[
+    net_type: vrt
+    upscale: 4
+    img_size: [6, 64, 64]
+    window_size: [6, 8, 8]
+    depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4]
+    indep_reconsts: [11, 12]
+    embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180]
+    num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
+    spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth
+    pa_frames: 2
+    deformable_groups: 12
+    nonblind_denoising: False
+    use_checkpoint_attn: False
+    use_checkpoint_ffn: False
+    no_checkpoint_attn_blocks: []
+    no_checkpoint_ffn_blocks: []
+    init_type: default
+    scale: 4
+  ]
+  train:[
+    G_lossfn_type: charbonnier
+    G_lossfn_weight: 1.0
+    G_charbonnier_eps: 1e-09
+    E_decay: 0
+    G_optimizer_type: adam
+    G_optimizer_lr: 0.0004
+    G_optimizer_betas: [0.9, 0.99]
+    G_optimizer_wd: 0
+    G_optimizer_clipgrad: None
+    G_optimizer_reuse: True
+    fix_iter: 20000
+    fix_lr_mul: 0.125
+    fix_keys: ['spynet', 'deform']
+    total_iter: 300000
+    G_scheduler_type: CosineAnnealingWarmRestarts
+    G_scheduler_periods: 300000
+    G_scheduler_eta_min: 1e-07
+    G_regularizer_orthstep: None
+    G_regularizer_clipstep: None
+    G_param_strict: True
+    E_param_strict: True
+    checkpoint_test: 5000
+    checkpoint_save: 5000
+    checkpoint_print: 200
+    F_feature_layer: 34
+    F_weights: 1.0
+    F_lossfn_type: l1
+    F_use_input_norm: True
+    F_use_range_norm: False
+    G_scheduler_restart_weights: 1
+  ]
+  val:[
+    save_img: False
+    pad_seq: False
+    flip_seq: False
+    center_frame_only: False
+    num_frame_testing: 40
+    num_frame_overlapping: 2
+    size_patch_testing: 128
+  ]
+  opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json
+  is_train: True
+  merge_bn: False
+  merge_bn_startpoint: -1
+  num_gpu: 8
+  rank: 0
+  world_size: 1
+
+22-03-11 09:54:50.223 : Number of train images: 27,000, iters: 3,375
+22-03-11 09:54:57.597 : 
+Networks name: VRT
+Params number: 30676435
+Net structure:
+VRT(
+  (conv_first): Conv3d(27, 120, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1))
+  (spynet): SpyNet(
+    (basic_module): ModuleList(
+      (0): BasicModule(
+        (basic_module): Sequential(
+          (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (1): ReLU()
+          (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (3): ReLU()
+          (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (5): ReLU()
+          (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (7): ReLU()
+          (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+        )
+      )
+      (1): BasicModule(
+        (basic_module): Sequential(
+          (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (1): ReLU()
+          (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (3): ReLU()
+          (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (5): ReLU()
+          (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (7): ReLU()
+          (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+        )
+      )
+      (2): BasicModule(
+        (basic_module): Sequential(
+          (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (1): ReLU()
+          (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (3): ReLU()
+          (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (5): ReLU()
+          (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (7): ReLU()
+          (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+        )
+      )
+      (3): BasicModule(
+        (basic_module): Sequential(
+          (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (1): ReLU()
+          (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (3): ReLU()
+          (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (5): ReLU()
+          (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (7): ReLU()
+          (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+        )
+      )
+      (4): BasicModule(
+        (basic_module): Sequential(
+          (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (1): ReLU()
+          (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (3): ReLU()
+          (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (5): ReLU()
+          (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (7): ReLU()
+          (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+        )
+      )
+      (5): BasicModule(
+        (basic_module): Sequential(
+          (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (1): ReLU()
+          (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (3): ReLU()
+          (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (5): ReLU()
+          (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+          (7): ReLU()
+          (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
+        )
+      )
+    )
+  )
+  (stage1): Stage(
+    (reshape): Sequential(
+      (0): Rearrange('n c d h w -> n d h w c')
+      (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+      (2): Rearrange('n d h w c -> n c d h w')
+    )
+    (residual_group1): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): Identity()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (2): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (3): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (4): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (5): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear1): Linear(in_features=120, out_features=120, bias=True)
+    (residual_group2): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): Identity()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear2): Linear(in_features=120, out_features=120, bias=True)
+    (pa_deform): DCNv2PackFlowGuided(
+      (conv_offset): Sequential(
+        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (1): LeakyReLU(negative_slope=0.1, inplace=True)
+        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (3): LeakyReLU(negative_slope=0.1, inplace=True)
+        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (5): LeakyReLU(negative_slope=0.1, inplace=True)
+        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+      )
+    )
+    (pa_fuse): Mlp_GEGLU(
+      (fc11): Linear(in_features=360, out_features=360, bias=True)
+      (fc12): Linear(in_features=360, out_features=360, bias=True)
+      (act): GELU()
+      (fc2): Linear(in_features=360, out_features=120, bias=True)
+      (drop): Dropout(p=0.0, inplace=False)
+    )
+  )
+  (stage2): Stage(
+    (reshape): Sequential(
+      (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2)
+      (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True)
+      (2): Linear(in_features=480, out_features=120, bias=True)
+      (3): Rearrange('n d h w c -> n c d h w')
+    )
+    (residual_group1): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (2): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (3): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (4): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (5): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear1): Linear(in_features=120, out_features=120, bias=True)
+    (residual_group2): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear2): Linear(in_features=120, out_features=120, bias=True)
+    (pa_deform): DCNv2PackFlowGuided(
+      (conv_offset): Sequential(
+        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (1): LeakyReLU(negative_slope=0.1, inplace=True)
+        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (3): LeakyReLU(negative_slope=0.1, inplace=True)
+        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (5): LeakyReLU(negative_slope=0.1, inplace=True)
+        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+      )
+    )
+    (pa_fuse): Mlp_GEGLU(
+      (fc11): Linear(in_features=360, out_features=360, bias=True)
+      (fc12): Linear(in_features=360, out_features=360, bias=True)
+      (act): GELU()
+      (fc2): Linear(in_features=360, out_features=120, bias=True)
+      (drop): Dropout(p=0.0, inplace=False)
+    )
+  )
+  (stage3): Stage(
+    (reshape): Sequential(
+      (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2)
+      (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True)
+      (2): Linear(in_features=480, out_features=120, bias=True)
+      (3): Rearrange('n d h w c -> n c d h w')
+    )
+    (residual_group1): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (2): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (3): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (4): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (5): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear1): Linear(in_features=120, out_features=120, bias=True)
+    (residual_group2): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear2): Linear(in_features=120, out_features=120, bias=True)
+    (pa_deform): DCNv2PackFlowGuided(
+      (conv_offset): Sequential(
+        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (1): LeakyReLU(negative_slope=0.1, inplace=True)
+        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (3): LeakyReLU(negative_slope=0.1, inplace=True)
+        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (5): LeakyReLU(negative_slope=0.1, inplace=True)
+        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+      )
+    )
+    (pa_fuse): Mlp_GEGLU(
+      (fc11): Linear(in_features=360, out_features=360, bias=True)
+      (fc12): Linear(in_features=360, out_features=360, bias=True)
+      (act): GELU()
+      (fc2): Linear(in_features=360, out_features=120, bias=True)
+      (drop): Dropout(p=0.0, inplace=False)
+    )
+  )
+  (stage4): Stage(
+    (reshape): Sequential(
+      (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2)
+      (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True)
+      (2): Linear(in_features=480, out_features=120, bias=True)
+      (3): Rearrange('n d h w c -> n c d h w')
+    )
+    (residual_group1): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (2): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (3): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (4): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (5): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear1): Linear(in_features=120, out_features=120, bias=True)
+    (residual_group2): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear2): Linear(in_features=120, out_features=120, bias=True)
+    (pa_deform): DCNv2PackFlowGuided(
+      (conv_offset): Sequential(
+        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (1): LeakyReLU(negative_slope=0.1, inplace=True)
+        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (3): LeakyReLU(negative_slope=0.1, inplace=True)
+        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (5): LeakyReLU(negative_slope=0.1, inplace=True)
+        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+      )
+    )
+    (pa_fuse): Mlp_GEGLU(
+      (fc11): Linear(in_features=360, out_features=360, bias=True)
+      (fc12): Linear(in_features=360, out_features=360, bias=True)
+      (act): GELU()
+      (fc2): Linear(in_features=360, out_features=120, bias=True)
+      (drop): Dropout(p=0.0, inplace=False)
+    )
+  )
+  (stage5): Stage(
+    (reshape): Sequential(
+      (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2)
+      (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True)
+      (2): Linear(in_features=30, out_features=120, bias=True)
+      (3): Rearrange('n d h w c -> n c d h w')
+    )
+    (residual_group1): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (2): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (3): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (4): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (5): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear1): Linear(in_features=120, out_features=120, bias=True)
+    (residual_group2): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear2): Linear(in_features=120, out_features=120, bias=True)
+    (pa_deform): DCNv2PackFlowGuided(
+      (conv_offset): Sequential(
+        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (1): LeakyReLU(negative_slope=0.1, inplace=True)
+        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (3): LeakyReLU(negative_slope=0.1, inplace=True)
+        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (5): LeakyReLU(negative_slope=0.1, inplace=True)
+        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+      )
+    )
+    (pa_fuse): Mlp_GEGLU(
+      (fc11): Linear(in_features=360, out_features=360, bias=True)
+      (fc12): Linear(in_features=360, out_features=360, bias=True)
+      (act): GELU()
+      (fc2): Linear(in_features=360, out_features=120, bias=True)
+      (drop): Dropout(p=0.0, inplace=False)
+    )
+  )
+  (stage6): Stage(
+    (reshape): Sequential(
+      (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2)
+      (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True)
+      (2): Linear(in_features=30, out_features=120, bias=True)
+      (3): Rearrange('n d h w c -> n c d h w')
+    )
+    (residual_group1): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (2): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (3): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (4): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (5): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear1): Linear(in_features=120, out_features=120, bias=True)
+    (residual_group2): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear2): Linear(in_features=120, out_features=120, bias=True)
+    (pa_deform): DCNv2PackFlowGuided(
+      (conv_offset): Sequential(
+        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (1): LeakyReLU(negative_slope=0.1, inplace=True)
+        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (3): LeakyReLU(negative_slope=0.1, inplace=True)
+        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (5): LeakyReLU(negative_slope=0.1, inplace=True)
+        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+      )
+    )
+    (pa_fuse): Mlp_GEGLU(
+      (fc11): Linear(in_features=360, out_features=360, bias=True)
+      (fc12): Linear(in_features=360, out_features=360, bias=True)
+      (act): GELU()
+      (fc2): Linear(in_features=360, out_features=120, bias=True)
+      (drop): Dropout(p=0.0, inplace=False)
+    )
+  )
+  (stage7): Stage(
+    (reshape): Sequential(
+      (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2)
+      (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True)
+      (2): Linear(in_features=30, out_features=120, bias=True)
+      (3): Rearrange('n d h w c -> n c d h w')
+    )
+    (residual_group1): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (2): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (3): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (4): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (5): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=240, out_features=120, bias=True)
+            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear1): Linear(in_features=120, out_features=120, bias=True)
+    (residual_group2): TMSAG(
+      (blocks): ModuleList(
+        (0): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+        (1): TMSA(
+          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (attn): WindowAttention(
+            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
+            (proj): Linear(in_features=120, out_features=120, bias=True)
+            (softmax): Softmax(dim=-1)
+          )
+          (drop_path): DropPath()
+          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+          (mlp): Mlp_GEGLU(
+            (fc11): Linear(in_features=120, out_features=240, bias=True)
+            (fc12): Linear(in_features=120, out_features=240, bias=True)
+            (act): GELU()
+            (fc2): Linear(in_features=240, out_features=120, bias=True)
+            (drop): Dropout(p=0.0, inplace=False)
+          )
+        )
+      )
+    )
+    (linear2): Linear(in_features=120, out_features=120, bias=True)
+    (pa_deform): DCNv2PackFlowGuided(
+      (conv_offset): Sequential(
+        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (1): LeakyReLU(negative_slope=0.1, inplace=True)
+        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (3): LeakyReLU(negative_slope=0.1, inplace=True)
+        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+        (5): LeakyReLU(negative_slope=0.1, inplace=True)
+        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+      )
+    )
+    (pa_fuse): Mlp_GEGLU(
+      (fc11): Linear(in_features=360, out_features=360, bias=True)
+      (fc12): Linear(in_features=360, out_features=360, bias=True)
+      (act): GELU()
+      (fc2): Linear(in_features=360, out_features=120, bias=True)
+      (drop): Dropout(p=0.0, inplace=False)
+    )
+  )
+  (stage8): ModuleList(
+    (0): Sequential(
+      (0): Rearrange('n c d h w -> n d h w c')
+      (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
+      (2): Linear(in_features=120, out_features=180, bias=True)
+      (3): Rearrange('n d h w c -> n c d h w')
+    )
+    (1): RTMSA(
+      (residual_group): TMSAG(
+        (blocks): ModuleList(
+          (0): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (1): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (2): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (3): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+        )
+      )
+      (linear): Linear(in_features=180, out_features=180, bias=True)
+    )
+    (2): RTMSA(
+      (residual_group): TMSAG(
+        (blocks): ModuleList(
+          (0): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (1): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (2): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (3): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+        )
+      )
+      (linear): Linear(in_features=180, out_features=180, bias=True)
+    )
+    (3): RTMSA(
+      (residual_group): TMSAG(
+        (blocks): ModuleList(
+          (0): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (1): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (2): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (3): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+        )
+      )
+      (linear): Linear(in_features=180, out_features=180, bias=True)
+    )
+    (4): RTMSA(
+      (residual_group): TMSAG(
+        (blocks): ModuleList(
+          (0): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (1): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): Mlp_GEGLU(
+              (fc11): Linear(in_features=180, out_features=360, bias=True)
+              (fc12): Linear(in_features=180, out_features=360, bias=True)
+              (act): GELU()
+              (fc2): Linear(in_features=360, out_features=180, bias=True)
+              (drop): Dropout(p=0.0, inplace=False)
+            )
+          )
+          (2): TMSA(
+            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (attn): WindowAttention(
+              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
+              (proj): Linear(in_features=180, out_features=180, bias=True)
+              (softmax): Softmax(dim=-1)
+            )
+            (drop_path): DropPath()
+            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
+            (mlp): 
Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (5): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + 
) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (6): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + ) + (norm): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (conv_after_body): Linear(in_features=180, out_features=120, bias=True) + (conv_before_upsample): Sequential( + (0): Conv3d(120, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): LeakyReLU(negative_slope=0.01, inplace=True) + ) + (upsample): Upsample( + (0): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): Transpose_Dim12() + (2): PixelShuffle(upscale_factor=2) + (3): Transpose_Dim12() + (4): LeakyReLU(negative_slope=0.1, inplace=True) + (5): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (6): Transpose_Dim12() + (7): PixelShuffle(upscale_factor=2) + (8): Transpose_Dim12() + (9): LeakyReLU(negative_slope=0.1, inplace=True) + (10): Conv3d(64, 
64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + ) + (conv_last): Conv3d(64, 3, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) +) + +22-03-11 09:54:57.779 : + | mean | min | max | std || shape + | 0.000 | -0.064 | 0.064 | 0.037 | torch.Size([120, 27, 1, 3, 3]) || conv_first.weight + | -0.005 | -0.063 | 0.062 | 0.037 | torch.Size([120]) || conv_first.bias + | 0.449 | 0.406 | 0.485 | 0.040 | torch.Size([1, 3, 1, 1]) || spynet.mean + | 0.226 | 0.224 | 0.229 | 0.003 | torch.Size([1, 3, 1, 1]) || spynet.std + | -0.000 | -0.684 | 0.720 | 0.066 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.0.basic_module.0.weight + | -0.055 | -0.917 | 0.306 | 0.335 | torch.Size([32]) || spynet.basic_module.0.basic_module.0.bias + | -0.009 | -3.201 | 0.948 | 0.096 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.0.basic_module.2.weight + | 0.039 | -1.273 | 0.675 | 0.311 | torch.Size([64]) || spynet.basic_module.0.basic_module.2.bias + | -0.010 | -4.690 | 0.568 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.0.basic_module.4.weight + | 0.162 | -0.704 | 0.905 | 0.366 | torch.Size([32]) || spynet.basic_module.0.basic_module.4.bias + | -0.023 | -1.714 | 0.414 | 0.091 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.0.basic_module.6.weight + | 0.787 | -1.061 | 1.170 | 0.522 | torch.Size([16]) || spynet.basic_module.0.basic_module.6.bias + | 0.000 | -0.145 | 0.166 | 0.018 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.0.basic_module.8.weight + | -0.000 | -0.001 | 0.000 | 0.001 | torch.Size([2]) || spynet.basic_module.0.basic_module.8.bias + | -0.000 | -0.726 | 0.782 | 0.070 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.1.basic_module.0.weight + | -0.024 | -0.810 | 0.352 | 0.313 | torch.Size([32]) || spynet.basic_module.1.basic_module.0.bias + | -0.008 | -3.370 | 0.914 | 0.098 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.1.basic_module.2.weight + | 0.042 | -1.197 | 0.699 | 0.302 | torch.Size([64]) || spynet.basic_module.1.basic_module.2.bias + | -0.008 | -4.468 | 0.566 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.1.basic_module.4.weight + | 0.160 | -0.745 | 0.996 | 0.391 | torch.Size([32]) || spynet.basic_module.1.basic_module.4.bias + | -0.017 | -1.648 | 0.317 | 0.084 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.1.basic_module.6.weight + | 0.785 | -1.176 | 1.158 | 0.543 | torch.Size([16]) || spynet.basic_module.1.basic_module.6.bias + | 0.000 | -0.145 | 0.163 | 0.014 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.1.basic_module.8.weight + | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([2]) || spynet.basic_module.1.basic_module.8.bias + | 0.000 | -1.003 | 0.875 | 0.089 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.2.basic_module.0.weight + | -0.021 | -0.979 | 0.466 | 0.373 | torch.Size([32]) || spynet.basic_module.2.basic_module.0.bias + | -0.008 | -4.622 | 1.220 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.2.basic_module.2.weight + | 0.028 | -1.276 | 0.717 | 0.308 | torch.Size([64]) || spynet.basic_module.2.basic_module.2.bias + | -0.007 | -1.827 | 0.624 | 0.092 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.2.basic_module.4.weight + | 0.123 | -0.697 | 0.745 | 0.334 | torch.Size([32]) || spynet.basic_module.2.basic_module.4.bias + | -0.010 | -1.295 | 0.330 | 0.068 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.2.basic_module.6.weight + | 0.677 | -1.696 | 0.934 | 0.637 | torch.Size([16]) || spynet.basic_module.2.basic_module.6.bias + | 0.000 | -0.114 | 0.129 | 0.008 | torch.Size([2, 16, 7, 7]) || 
spynet.basic_module.2.basic_module.8.weight + | -0.003 | -0.008 | 0.002 | 0.007 | torch.Size([2]) || spynet.basic_module.2.basic_module.8.bias + | 0.000 | -1.053 | 0.952 | 0.091 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.3.basic_module.0.weight + | -0.016 | -1.061 | 0.522 | 0.414 | torch.Size([32]) || spynet.basic_module.3.basic_module.0.bias + | -0.008 | -4.891 | 1.222 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.3.basic_module.2.weight + | 0.029 | -1.264 | 0.760 | 0.309 | torch.Size([64]) || spynet.basic_module.3.basic_module.2.bias + | -0.007 | -1.792 | 0.579 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.3.basic_module.4.weight + | 0.117 | -0.694 | 0.670 | 0.329 | torch.Size([32]) || spynet.basic_module.3.basic_module.4.bias + | -0.008 | -1.108 | 0.324 | 0.065 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.3.basic_module.6.weight + | 0.652 | -1.754 | 0.901 | 0.647 | torch.Size([16]) || spynet.basic_module.3.basic_module.6.bias + | 0.000 | -0.117 | 0.129 | 0.008 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.3.basic_module.8.weight + | 0.002 | -0.003 | 0.007 | 0.007 | torch.Size([2]) || spynet.basic_module.3.basic_module.8.bias + | -0.000 | -1.085 | 0.998 | 0.092 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.4.basic_module.0.weight + | 0.009 | -0.975 | 0.477 | 0.368 | torch.Size([32]) || spynet.basic_module.4.basic_module.0.bias + | -0.008 | -5.056 | 1.282 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.4.basic_module.2.weight + | 0.029 | -1.240 | 0.796 | 0.311 | torch.Size([64]) || spynet.basic_module.4.basic_module.2.bias + | -0.007 | -1.772 | 0.600 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.4.basic_module.4.weight + | 0.121 | -0.688 | 0.694 | 0.331 | torch.Size([32]) || spynet.basic_module.4.basic_module.4.bias + | -0.007 | -0.980 | 0.320 | 0.065 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.4.basic_module.6.weight + | 0.642 | -1.810 | 0.912 | 0.662 | torch.Size([16]) || spynet.basic_module.4.basic_module.6.bias + | 0.000 | -0.188 | 0.209 | 0.011 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.4.basic_module.8.weight + | -0.002 | -0.008 | 0.005 | 0.009 | torch.Size([2]) || spynet.basic_module.4.basic_module.8.bias + | -0.000 | -1.085 | 0.999 | 0.092 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.5.basic_module.0.weight + | 0.009 | -0.982 | 0.474 | 0.368 | torch.Size([32]) || spynet.basic_module.5.basic_module.0.bias + | -0.008 | -5.089 | 1.311 | 0.119 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.5.basic_module.2.weight + | 0.029 | -1.256 | 0.804 | 0.314 | torch.Size([64]) || spynet.basic_module.5.basic_module.2.bias + | -0.008 | -1.788 | 0.613 | 0.093 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.5.basic_module.4.weight + | 0.122 | -0.699 | 0.700 | 0.334 | torch.Size([32]) || spynet.basic_module.5.basic_module.4.bias + | -0.008 | -1.010 | 0.323 | 0.067 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.5.basic_module.6.weight + | 0.650 | -1.834 | 0.923 | 0.670 | torch.Size([16]) || spynet.basic_module.5.basic_module.6.bias + | 0.000 | -0.192 | 0.213 | 0.011 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.5.basic_module.8.weight + | -0.001 | -0.007 | 0.005 | 0.009 | torch.Size([2]) || spynet.basic_module.5.basic_module.8.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.reshape.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.reshape.1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || 
stage1.residual_group1.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.bias + | 0.000 | -0.065 | 0.069 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_self.weight + | 0.003 | -0.090 | 0.091 | 0.050 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.attn.proj.weight + | 0.005 | -0.063 | 0.064 | 0.038 | torch.Size([120]) || stage1.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.004 | -0.090 | 0.091 | 0.052 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc11.weight + | -0.002 | -0.091 | 0.091 | 0.050 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc12.weight + | -0.004 | -0.089 | 0.088 | 0.052 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.mlp.fc2.weight + | -0.003 | -0.064 | 0.064 | 0.040 | torch.Size([120]) || stage1.residual_group1.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.bias + | 0.000 | -0.070 | 0.070 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_self.weight + | 0.001 | -0.091 | 0.090 | 0.053 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.attn.proj.weight + | -0.001 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage1.residual_group1.blocks.1.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.003 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || 
stage1.residual_group1.blocks.1.mlp.fc11.weight + | -0.002 | -0.091 | 0.089 | 0.052 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc12.weight + | 0.003 | -0.091 | 0.089 | 0.051 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.mlp.fc2.weight + | -0.004 | -0.064 | 0.063 | 0.037 | torch.Size([120]) || stage1.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.bias + | -0.000 | -0.072 | 0.073 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_self.weight + | 0.002 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.038 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.attn.proj.weight + | -0.004 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage1.residual_group1.blocks.2.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.001 | -0.091 | 0.090 | 0.053 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc11.weight + | 0.002 | -0.091 | 0.090 | 0.054 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc12.weight + | -0.007 | -0.091 | 0.089 | 0.051 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.mlp.fc2.weight + | 0.000 | -0.062 | 0.064 | 0.037 | torch.Size([120]) || stage1.residual_group1.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.bias + | 0.000 | -0.067 | 0.067 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_self.weight + | 0.003 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || 
stage1.residual_group1.blocks.3.attn.proj.weight + | -0.002 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage1.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.000 | -0.090 | 0.091 | 0.051 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc11.weight + | -0.008 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc12.weight + | -0.005 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.mlp.fc2.weight + | 0.005 | -0.063 | 0.061 | 0.035 | torch.Size([120]) || stage1.residual_group1.blocks.3.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.bias + | 0.000 | -0.079 | 0.068 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_self.weight + | -0.002 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.attn.proj.weight + | 0.003 | -0.064 | 0.064 | 0.035 | torch.Size([120]) || stage1.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc11.weight + | 0.006 | -0.091 | 0.089 | 0.052 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc12.weight + | 0.006 | -0.087 | 0.091 | 0.050 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.mlp.fc2.weight + | -0.000 | -0.064 | 0.063 | 0.037 | torch.Size([120]) || stage1.residual_group1.blocks.4.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.bias + | 0.000 | -0.077 | 0.071 | 0.020 
| torch.Size([675, 6]) || stage1.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_self.weight + | 0.003 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.attn.proj.weight + | -0.004 | -0.064 | 0.064 | 0.037 | torch.Size([120]) || stage1.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.003 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc11.weight + | -0.000 | -0.089 | 0.089 | 0.050 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc12.weight + | -0.004 | -0.090 | 0.091 | 0.052 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.mlp.fc2.weight + | -0.003 | -0.064 | 0.063 | 0.034 | torch.Size([120]) || stage1.residual_group1.blocks.5.mlp.fc2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage1.linear1.weight + | -0.010 | -0.090 | 0.091 | 0.050 | torch.Size([120]) || stage1.linear1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.bias + | 0.000 | -0.079 | 0.088 | 0.020 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group2.blocks.0.attn.qkv_self.weight + | 0.005 | -0.091 | 0.091 | 0.050 | torch.Size([360]) || stage1.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage1.residual_group2.blocks.0.attn.proj.weight + | -0.002 | -0.090 | 0.090 | 0.054 | torch.Size([120]) || stage1.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc11.weight + | 0.002 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc12.weight + | 0.001 | -0.089 | 0.091 | 0.054 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc12.bias 
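The per-parameter table above (and continuing below) prints mean / min / max / std plus the shape for every entry in the model's state_dict, parameters and registered buffers alike (which is why integer buffers such as relative_position_index appear alongside trained weights). A minimal sketch of how such a table can be produced from any torch.nn.Module — an illustrative helper written for this note, not the KAIR logging utility itself:

import torch

def describe_params(model: torch.nn.Module) -> str:
    # One row per state_dict entry, matching the
    # " | mean | min | max | std || shape" layout logged above.
    rows = [" | mean | min | max | std || shape"]
    for name, t in model.state_dict().items():
        v = t.float()  # integer buffers (e.g. relative_position_index) need casting
        rows.append(
            " | {:.3f} | {:.3f} | {:.3f} | {:.3f} | {} || {}".format(
                v.mean().item(), v.min().item(), v.max().item(),
                v.std().item(), t.shape, name))
    return "\n".join(rows)

# e.g. print(describe_params(torch.nn.Conv3d(27, 120, (1, 3, 3))))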
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group2.blocks.0.mlp.fc2.weight + | 0.000 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage1.residual_group2.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.bias + | 0.000 | -0.078 | 0.083 | 0.020 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group2.blocks.1.attn.qkv_self.weight + | -0.002 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage1.residual_group2.blocks.1.attn.qkv_self.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage1.residual_group2.blocks.1.attn.proj.weight + | -0.003 | -0.088 | 0.089 | 0.052 | torch.Size([120]) || stage1.residual_group2.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc11.weight + | -0.000 | -0.090 | 0.090 | 0.053 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc12.weight + | -0.001 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group2.blocks.1.mlp.fc2.weight + | -0.000 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage1.residual_group2.blocks.1.mlp.fc2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage1.linear2.weight + | 0.000 | -0.091 | 0.091 | 0.048 | torch.Size([120]) || stage1.linear2.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.pa_deform.bias + | -0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage1.pa_deform.conv_offset.0.weight + | -0.001 | -0.021 | 0.021 | 0.012 | torch.Size([120]) || stage1.pa_deform.conv_offset.0.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.2.weight + | 0.000 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage1.pa_deform.conv_offset.2.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.4.weight + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage1.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage1.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage1.pa_deform.conv_offset.6.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage1.pa_fuse.fc11.weight + | 0.002 | -0.052 | 0.053 | 0.030 | torch.Size([360]) || stage1.pa_fuse.fc11.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage1.pa_fuse.fc12.weight + | -0.001 | -0.053 | 0.053 | 0.031 | torch.Size([360]) || stage1.pa_fuse.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage1.pa_fuse.fc2.weight + | 
0.002 | -0.052 | 0.052 | 0.030 | torch.Size([120]) || stage1.pa_fuse.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([480]) || stage2.reshape.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([480]) || stage2.reshape.1.bias + | 0.000 | -0.046 | 0.046 | 0.026 | torch.Size([120, 480]) || stage2.reshape.2.weight + | -0.001 | -0.045 | 0.045 | 0.026 | torch.Size([120]) || stage2.reshape.2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.bias + | 0.000 | -0.070 | 0.065 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_self.weight + | -0.000 | -0.090 | 0.091 | 0.053 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.attn.proj.weight + | 0.003 | -0.063 | 0.064 | 0.039 | torch.Size([120]) || stage2.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.002 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc11.weight + | -0.004 | -0.090 | 0.090 | 0.053 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc12.weight + | -0.005 | -0.090 | 0.089 | 0.055 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.mlp.fc2.weight + | 0.003 | -0.063 | 0.064 | 0.039 | torch.Size([120]) || stage2.residual_group1.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.bias + | -0.000 | -0.071 | 0.066 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.1.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.attn.proj.weight + | -0.002 | -0.064 | 0.060 | 0.037 | torch.Size([120]) || stage2.residual_group1.blocks.1.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | 
torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.003 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc11.weight + | -0.001 | -0.091 | 0.088 | 0.054 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc12.weight + | -0.004 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.mlp.fc2.weight + | -0.007 | -0.064 | 0.064 | 0.036 | torch.Size([120]) || stage2.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.bias + | 0.000 | -0.068 | 0.075 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.2.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_self.weight + | -0.002 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.attn.proj.weight + | 0.000 | -0.063 | 0.063 | 0.036 | torch.Size([120]) || stage2.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.004 | -0.091 | 0.091 | 0.050 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc11.weight + | -0.001 | -0.091 | 0.090 | 0.053 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc12.weight + | -0.008 | -0.091 | 0.091 | 0.055 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.mlp.fc2.weight + | 0.006 | -0.063 | 0.065 | 0.038 | torch.Size([120]) || stage2.residual_group1.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.bias + | -0.000 | -0.095 | 0.063 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || 
stage2.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_self.weight + | 0.001 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.attn.proj.weight + | -0.007 | -0.064 | 0.064 | 0.036 | torch.Size([120]) || stage2.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.003 | -0.090 | 0.091 | 0.054 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc11.weight + | -0.003 | -0.089 | 0.090 | 0.050 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc12.weight + | -0.003 | -0.090 | 0.091 | 0.053 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.mlp.fc2.weight + | -0.000 | -0.064 | 0.063 | 0.038 | torch.Size([120]) || stage2.residual_group1.blocks.3.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.070 | 0.081 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.attn.proj.weight + | 0.000 | -0.061 | 0.064 | 0.037 | torch.Size([120]) || stage2.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.000 | -0.090 | 0.091 | 0.054 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc11.weight + | 0.003 | -0.091 | 0.090 | 0.053 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc11.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc12.weight + | 0.003 | -0.088 | 0.091 | 0.051 | torch.Size([240]) || 
stage2.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.mlp.fc2.weight + | 0.000 | -0.064 | 0.062 | 0.037 | torch.Size([120]) || stage2.residual_group1.blocks.4.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.bias + | -0.000 | -0.072 | 0.077 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_self.weight + | -0.005 | -0.091 | 0.089 | 0.053 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.attn.proj.weight + | -0.000 | -0.063 | 0.064 | 0.039 | torch.Size([120]) || stage2.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.000 | -0.091 | 0.089 | 0.054 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc11.weight + | -0.001 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc12.weight + | -0.005 | -0.091 | 0.091 | 0.055 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.mlp.fc2.weight + | -0.000 | -0.063 | 0.065 | 0.039 | torch.Size([120]) || stage2.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage2.linear1.weight + | -0.003 | -0.090 | 0.089 | 0.054 | torch.Size([120]) || stage2.linear1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.bias + | 0.000 | -0.077 | 0.106 | 0.020 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group2.blocks.0.attn.qkv_self.weight + | 0.005 | -0.091 | 0.091 | 0.050 | torch.Size([360]) || stage2.residual_group2.blocks.0.attn.qkv_self.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage2.residual_group2.blocks.0.attn.proj.weight + | 0.005 | -0.090 | 0.090 | 0.050 | torch.Size([120]) || stage2.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 
0.000 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc11.weight + | -0.002 | -0.090 | 0.091 | 0.053 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc12.weight + | -0.002 | -0.091 | 0.090 | 0.052 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group2.blocks.0.mlp.fc2.weight + | 0.000 | -0.062 | 0.064 | 0.037 | torch.Size([120]) || stage2.residual_group2.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.bias + | 0.000 | -0.077 | 0.080 | 0.020 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group2.blocks.1.attn.qkv_self.weight + | 0.002 | -0.091 | 0.090 | 0.053 | torch.Size([360]) || stage2.residual_group2.blocks.1.attn.qkv_self.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage2.residual_group2.blocks.1.attn.proj.weight + | 0.013 | -0.088 | 0.090 | 0.051 | torch.Size([120]) || stage2.residual_group2.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc11.weight + | -0.002 | -0.090 | 0.091 | 0.051 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc12.weight + | 0.004 | -0.091 | 0.091 | 0.055 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group2.blocks.1.mlp.fc2.weight + | -0.005 | -0.063 | 0.063 | 0.038 | torch.Size([120]) || stage2.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage2.linear2.weight + | -0.000 | -0.088 | 0.090 | 0.053 | torch.Size([120]) || stage2.linear2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.pa_deform.bias + | -0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage2.pa_deform.conv_offset.0.weight + | 0.002 | -0.021 | 0.021 | 0.012 | torch.Size([120]) || stage2.pa_deform.conv_offset.0.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.2.weight + | 0.001 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage2.pa_deform.conv_offset.2.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.4.weight + | 0.002 | -0.027 | 0.030 | 0.016 | torch.Size([120]) || stage2.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage2.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 
| torch.Size([324]) || stage2.pa_deform.conv_offset.6.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage2.pa_fuse.fc11.weight + | 0.002 | -0.053 | 0.053 | 0.031 | torch.Size([360]) || stage2.pa_fuse.fc11.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage2.pa_fuse.fc12.weight + | -0.001 | -0.053 | 0.052 | 0.030 | torch.Size([360]) || stage2.pa_fuse.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.031 | torch.Size([120, 360]) || stage2.pa_fuse.fc2.weight + | -0.002 | -0.052 | 0.052 | 0.030 | torch.Size([120]) || stage2.pa_fuse.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([480]) || stage3.reshape.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([480]) || stage3.reshape.1.bias + | 0.000 | -0.046 | 0.046 | 0.026 | torch.Size([120, 480]) || stage3.reshape.2.weight + | 0.001 | -0.045 | 0.045 | 0.027 | torch.Size([120]) || stage3.reshape.2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.bias + | 0.000 | -0.072 | 0.071 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_self.weight + | 0.003 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.attn.proj.weight + | -0.001 | -0.064 | 0.064 | 0.035 | torch.Size([120]) || stage3.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc11.weight + | 0.001 | -0.090 | 0.091 | 0.052 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc12.weight + | 0.002 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.mlp.fc2.weight + | 0.001 | -0.064 | 0.064 | 0.035 | torch.Size([120]) || stage3.residual_group1.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.bias + | -0.000 | -0.071 | 0.070 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || 
stage3.residual_group1.blocks.1.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_self.weight + | 0.001 | -0.090 | 0.091 | 0.051 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.attn.proj.weight + | 0.003 | -0.060 | 0.064 | 0.035 | torch.Size([120]) || stage3.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc11.weight + | -0.004 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc12.weight + | -0.000 | -0.090 | 0.089 | 0.053 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.mlp.fc2.weight + | -0.002 | -0.064 | 0.064 | 0.037 | torch.Size([120]) || stage3.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.bias + | -0.000 | -0.076 | 0.074 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_self.weight + | 0.005 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.attn.proj.weight + | 0.001 | -0.064 | 0.064 | 0.037 | torch.Size([120]) || stage3.residual_group1.blocks.2.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.001 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc11.weight + | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc12.weight + | 0.007 | -0.090 | 0.090 | 0.053 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.mlp.fc2.weight 
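Note on reading these rows: each one lists mean | min | max | std, the tensor shape, and the state_dict key of one tensor in the VRT generator. Rows whose statistics are identically 0.000 (for example the pa_deform.conv_offset.6 weight and bias of every stage) are consistent with the common practice of zero-initializing the final offset-prediction layer of deformable alignment, so training starts from the identity warp. The fc11/fc12/fc2 triplets in the mlp and pa_fuse entries match a gated, GEGLU-style MLP, i.e. fc2(GELU(fc11(x)) * fc12(x)), as used in VRT. The sketch below shows how such a table can be produced for any PyTorch module; the function name and exact column formatting are illustrative assumptions, not necessarily KAIR's own helper. Iterating over state_dict() rather than named_parameters() is what makes non-trainable buffers such as relative_position_index appear in the listing.

import torch
from torch import nn

def describe_state_dict(net: nn.Module) -> str:
    """One ' | mean | min | max | std | shape || name' row per tensor."""
    rows = []
    for name, t in net.state_dict().items():
        v = t.float()  # buffers may be integer-typed, e.g. relative_position_index
        std = v.std().item() if v.numel() > 1 else 0.0  # std of a scalar is NaN
        rows.append(' | {:.3f} | {:.3f} | {:.3f} | {:.3f} | {} || {}'.format(
            v.mean().item(), v.min().item(), v.max().item(), std,
            tuple(t.shape), name))
    return '\n'.join(rows)

# Example: print(describe_state_dict(nn.Linear(120, 240))) yields one row for
# 'weight' with shape (240, 120) and one for 'bias' with shape (240,).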
+ | -0.002 | -0.062 | 0.064 | 0.038 | torch.Size([120]) || stage3.residual_group1.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.bias + | -0.000 | -0.073 | 0.065 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_self.weight + | 0.006 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.attn.proj.weight + | 0.002 | -0.063 | 0.063 | 0.035 | torch.Size([120]) || stage3.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.003 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc11.weight + | 0.002 | -0.091 | 0.088 | 0.051 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc12.weight + | -0.002 | -0.091 | 0.090 | 0.051 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.mlp.fc2.weight + | -0.001 | -0.065 | 0.064 | 0.040 | torch.Size([120]) || stage3.residual_group1.blocks.3.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.080 | 0.063 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.4.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.attn.proj.weight + | 0.001 | -0.064 | 0.062 | 0.040 | torch.Size([120]) || stage3.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.weight + | 0.000 | 
0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc11.weight + | -0.007 | -0.090 | 0.091 | 0.054 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc12.weight + | 0.004 | -0.091 | 0.089 | 0.052 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.mlp.fc2.weight + | -0.001 | -0.062 | 0.063 | 0.036 | torch.Size([120]) || stage3.residual_group1.blocks.4.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.bias + | -0.000 | -0.069 | 0.079 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_self.weight + | -0.004 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.attn.proj.weight + | 0.005 | -0.064 | 0.064 | 0.036 | torch.Size([120]) || stage3.residual_group1.blocks.5.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.002 | -0.090 | 0.091 | 0.053 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc11.weight + | -0.005 | -0.090 | 0.090 | 0.055 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc12.weight + | -0.000 | -0.091 | 0.089 | 0.053 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.mlp.fc2.weight + | 0.004 | -0.064 | 0.064 | 0.040 | torch.Size([120]) || stage3.residual_group1.blocks.5.mlp.fc2.bias + | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([120, 120]) || stage3.linear1.weight + | 0.003 | -0.091 | 0.091 | 0.054 | torch.Size([120]) || stage3.linear1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.bias + | -0.000 | -0.077 | 0.075 | 0.020 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.0.attn.relative_position_index + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || 
stage3.residual_group2.blocks.0.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage3.residual_group2.blocks.0.attn.qkv_self.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage3.residual_group2.blocks.0.attn.proj.weight + | -0.011 | -0.091 | 0.091 | 0.053 | torch.Size([120]) || stage3.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc11.weight + | -0.008 | -0.091 | 0.089 | 0.052 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc12.weight + | -0.004 | -0.090 | 0.090 | 0.053 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group2.blocks.0.mlp.fc2.weight + | -0.002 | -0.063 | 0.064 | 0.039 | torch.Size([120]) || stage3.residual_group2.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.088 | 0.080 | 0.020 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group2.blocks.1.attn.qkv_self.weight + | -0.002 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage3.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage3.residual_group2.blocks.1.attn.proj.weight + | -0.003 | -0.091 | 0.089 | 0.054 | torch.Size([120]) || stage3.residual_group2.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc11.weight + | -0.000 | -0.090 | 0.090 | 0.054 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc12.weight + | 0.002 | -0.089 | 0.091 | 0.051 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group2.blocks.1.mlp.fc2.weight + | 0.002 | -0.061 | 0.062 | 0.034 | torch.Size([120]) || stage3.residual_group2.blocks.1.mlp.fc2.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage3.linear2.weight + | 0.002 | -0.089 | 0.091 | 0.048 | torch.Size([120]) || stage3.linear2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.pa_deform.bias + | 0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage3.pa_deform.conv_offset.0.weight + | 0.000 | -0.021 | 0.021 | 0.011 | torch.Size([120]) || stage3.pa_deform.conv_offset.0.bias + | 0.000 | -0.030 | 0.030 | 
0.018 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.2.weight + | -0.002 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage3.pa_deform.conv_offset.2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.4.weight + | -0.001 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage3.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage3.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage3.pa_deform.conv_offset.6.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage3.pa_fuse.fc11.weight + | -0.002 | -0.053 | 0.053 | 0.029 | torch.Size([360]) || stage3.pa_fuse.fc11.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage3.pa_fuse.fc12.weight + | 0.005 | -0.053 | 0.052 | 0.030 | torch.Size([360]) || stage3.pa_fuse.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage3.pa_fuse.fc2.weight + | 0.007 | -0.052 | 0.053 | 0.029 | torch.Size([120]) || stage3.pa_fuse.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([480]) || stage4.reshape.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([480]) || stage4.reshape.1.bias + | -0.000 | -0.046 | 0.046 | 0.026 | torch.Size([120, 480]) || stage4.reshape.2.weight + | -0.002 | -0.046 | 0.045 | 0.027 | torch.Size([120]) || stage4.reshape.2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.bias + | 0.000 | -0.065 | 0.070 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_self.weight + | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.attn.proj.weight + | -0.002 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage4.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.004 | -0.091 | 0.090 | 0.055 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc11.weight + | 0.004 | -0.091 | 0.090 | 0.053 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc12.weight + | -0.000 | -0.091 | 0.090 | 0.053 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.mlp.fc2.weight + | 0.001 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage4.residual_group1.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | 
torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.bias + | 0.000 | -0.073 | 0.086 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.1.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.attn.proj.weight + | 0.003 | -0.065 | 0.063 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.004 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc11.weight + | -0.003 | -0.091 | 0.089 | 0.051 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc12.weight + | -0.001 | -0.091 | 0.089 | 0.053 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.mlp.fc2.weight + | -0.004 | -0.064 | 0.063 | 0.037 | torch.Size([120]) || stage4.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.bias + | 0.000 | -0.064 | 0.069 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_self.weight + | 0.002 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.attn.proj.weight + | -0.004 | -0.063 | 0.064 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.2.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.002 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 
120]) || stage4.residual_group1.blocks.2.mlp.fc11.weight + | -0.006 | -0.090 | 0.091 | 0.054 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc12.weight + | 0.004 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.mlp.fc2.weight + | 0.003 | -0.065 | 0.064 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.bias + | -0.000 | -0.067 | 0.074 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.attn.proj.weight + | 0.002 | -0.064 | 0.064 | 0.042 | torch.Size([120]) || stage4.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.001 | -0.090 | 0.091 | 0.051 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc11.weight + | 0.001 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc12.weight + | -0.001 | -0.089 | 0.091 | 0.052 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.mlp.fc2.weight + | 0.006 | -0.064 | 0.064 | 0.036 | torch.Size([120]) || stage4.residual_group1.blocks.3.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.bias + | 0.000 | -0.074 | 0.077 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_self.weight + | -0.004 | -0.090 | 0.091 | 0.053 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || 
stage4.residual_group1.blocks.4.attn.proj.weight + | -0.003 | -0.061 | 0.064 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.003 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc11.weight + | 0.000 | -0.090 | 0.089 | 0.050 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc12.weight + | -0.001 | -0.091 | 0.090 | 0.052 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.mlp.fc2.weight + | -0.002 | -0.065 | 0.063 | 0.035 | torch.Size([120]) || stage4.residual_group1.blocks.4.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.bias + | 0.000 | -0.076 | 0.074 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_self.weight + | -0.000 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.attn.proj.weight + | -0.001 | -0.063 | 0.064 | 0.036 | torch.Size([120]) || stage4.residual_group1.blocks.5.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.001 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc11.weight + | 0.001 | -0.091 | 0.089 | 0.052 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc12.weight + | 0.004 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.mlp.fc2.weight + | 0.001 | -0.064 | 0.064 | 0.035 | torch.Size([120]) || stage4.residual_group1.blocks.5.mlp.fc2.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage4.linear1.weight + | 0.005 | -0.091 | 0.091 | 0.053 | torch.Size([120]) || stage4.linear1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || 
stage4.residual_group2.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.bias + | -0.000 | -0.066 | 0.086 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group2.blocks.0.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage4.residual_group2.blocks.0.attn.qkv_self.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage4.residual_group2.blocks.0.attn.proj.weight + | -0.005 | -0.089 | 0.084 | 0.053 | torch.Size([120]) || stage4.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.bias + | -0.001 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc11.weight + | -0.003 | -0.090 | 0.090 | 0.051 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc12.weight + | -0.006 | -0.090 | 0.089 | 0.054 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group2.blocks.0.mlp.fc2.weight + | -0.003 | -0.064 | 0.062 | 0.037 | torch.Size([120]) || stage4.residual_group2.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.074 | 0.082 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group2.blocks.1.attn.qkv_self.weight + | 0.004 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage4.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage4.residual_group2.blocks.1.attn.proj.weight + | 0.000 | -0.091 | 0.091 | 0.055 | torch.Size([120]) || stage4.residual_group2.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc11.weight + | 0.001 | -0.091 | 0.090 | 0.056 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc12.weight + | -0.002 | -0.090 | 0.091 | 0.052 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group2.blocks.1.mlp.fc2.weight + | -0.004 | -0.064 | 0.062 | 0.036 | torch.Size([120]) || stage4.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage4.linear2.weight + | 0.006 | 
-0.091 | 0.090 | 0.057 | torch.Size([120]) || stage4.linear2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.pa_deform.bias + | -0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage4.pa_deform.conv_offset.0.weight + | -0.000 | -0.020 | 0.021 | 0.011 | torch.Size([120]) || stage4.pa_deform.conv_offset.0.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.2.weight + | -0.003 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage4.pa_deform.conv_offset.2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.4.weight + | -0.001 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage4.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage4.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage4.pa_deform.conv_offset.6.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage4.pa_fuse.fc11.weight + | 0.000 | -0.052 | 0.053 | 0.029 | torch.Size([360]) || stage4.pa_fuse.fc11.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage4.pa_fuse.fc12.weight + | -0.001 | -0.052 | 0.053 | 0.029 | torch.Size([360]) || stage4.pa_fuse.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage4.pa_fuse.fc2.weight + | -0.002 | -0.053 | 0.051 | 0.029 | torch.Size([120]) || stage4.pa_fuse.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([30]) || stage5.reshape.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([30]) || stage5.reshape.1.bias + | -0.002 | -0.183 | 0.182 | 0.105 | torch.Size([120, 30]) || stage5.reshape.2.weight + | 0.014 | -0.182 | 0.181 | 0.113 | torch.Size([120]) || stage5.reshape.2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.073 | 0.066 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_self.weight + | -0.001 | -0.090 | 0.090 | 0.050 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.attn.proj.weight + | 0.006 | -0.062 | 0.064 | 0.039 | torch.Size([120]) || stage5.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc11.weight + | -0.001 | -0.091 | 0.090 | 0.052 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc11.bias + 
| -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc12.weight + | 0.004 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.mlp.fc2.weight + | 0.002 | -0.064 | 0.063 | 0.039 | torch.Size([120]) || stage5.residual_group1.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.bias + | 0.000 | -0.073 | 0.082 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.attn.proj.weight + | 0.002 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage5.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.001 | -0.090 | 0.091 | 0.053 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc11.weight + | -0.003 | -0.090 | 0.090 | 0.053 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc12.weight + | -0.001 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.mlp.fc2.weight + | -0.000 | -0.063 | 0.062 | 0.036 | torch.Size([120]) || stage5.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.bias + | -0.000 | -0.086 | 0.069 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.2.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_self.weight + | -0.004 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.attn.proj.weight + | 0.004 | -0.063 | 0.064 | 0.040 | torch.Size([120]) || stage5.residual_group1.blocks.2.attn.proj.bias + | 0.000 | 
-0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.004 | -0.091 | 0.090 | 0.053 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc11.weight + | 0.005 | -0.091 | 0.090 | 0.054 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc11.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc12.weight + | 0.001 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.mlp.fc2.weight + | 0.000 | -0.064 | 0.063 | 0.039 | torch.Size([120]) || stage5.residual_group1.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.bias + | 0.000 | -0.070 | 0.068 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_self.weight + | -0.003 | -0.090 | 0.091 | 0.052 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.attn.proj.weight + | 0.003 | -0.063 | 0.064 | 0.038 | torch.Size([120]) || stage5.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.001 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc11.weight + | 0.002 | -0.091 | 0.091 | 0.049 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc12.weight + | 0.001 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.mlp.fc2.weight + | 0.001 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage5.residual_group1.blocks.3.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.068 | 0.077 | 0.019 | torch.Size([675, 6]) || stage5.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || 
stage5.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.attn.proj.weight + | -0.003 | -0.063 | 0.064 | 0.039 | torch.Size([120]) || stage5.residual_group1.blocks.4.attn.proj.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.003 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc11.weight + | -0.002 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc12.weight + | 0.002 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.mlp.fc2.weight + | 0.001 | -0.063 | 0.063 | 0.040 | torch.Size([120]) || stage5.residual_group1.blocks.4.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.bias + | 0.000 | -0.068 | 0.075 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_self.weight + | -0.003 | -0.090 | 0.091 | 0.053 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.attn.proj.weight + | 0.001 | -0.063 | 0.063 | 0.034 | torch.Size([120]) || stage5.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.002 | -0.090 | 0.091 | 0.053 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc11.weight + | -0.002 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc12.weight + | -0.001 | -0.091 | 0.091 | 0.057 | torch.Size([240]) || 
stage5.residual_group1.blocks.5.mlp.fc12.bias + | -0.001 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.mlp.fc2.weight + | -0.003 | -0.064 | 0.061 | 0.038 | torch.Size([120]) || stage5.residual_group1.blocks.5.mlp.fc2.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage5.linear1.weight + | 0.002 | -0.089 | 0.091 | 0.052 | torch.Size([120]) || stage5.linear1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.bias + | -0.000 | -0.079 | 0.089 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group2.blocks.0.attn.qkv_self.weight + | 0.002 | -0.090 | 0.090 | 0.049 | torch.Size([360]) || stage5.residual_group2.blocks.0.attn.qkv_self.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage5.residual_group2.blocks.0.attn.proj.weight + | 0.000 | -0.091 | 0.090 | 0.049 | torch.Size([120]) || stage5.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc11.weight + | 0.000 | -0.091 | 0.089 | 0.056 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc12.weight + | 0.003 | -0.091 | 0.091 | 0.055 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group2.blocks.0.mlp.fc2.weight + | -0.006 | -0.062 | 0.062 | 0.036 | torch.Size([120]) || stage5.residual_group2.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.bias + | 0.000 | -0.077 | 0.082 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group2.blocks.1.attn.qkv_self.weight + | -0.001 | -0.090 | 0.091 | 0.053 | torch.Size([360]) || stage5.residual_group2.blocks.1.attn.qkv_self.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage5.residual_group2.blocks.1.attn.proj.weight + | -0.007 | -0.090 | 0.091 | 0.054 | torch.Size([120]) || stage5.residual_group2.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc11.weight + | 0.005 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 
120]) || stage5.residual_group2.blocks.1.mlp.fc12.weight + | -0.007 | -0.091 | 0.090 | 0.051 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group2.blocks.1.mlp.fc2.weight + | -0.001 | -0.064 | 0.062 | 0.037 | torch.Size([120]) || stage5.residual_group2.blocks.1.mlp.fc2.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage5.linear2.weight + | 0.006 | -0.089 | 0.091 | 0.053 | torch.Size([120]) || stage5.linear2.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.pa_deform.bias + | 0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage5.pa_deform.conv_offset.0.weight + | -0.002 | -0.021 | 0.021 | 0.013 | torch.Size([120]) || stage5.pa_deform.conv_offset.0.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.2.weight + | -0.002 | -0.030 | 0.029 | 0.017 | torch.Size([120]) || stage5.pa_deform.conv_offset.2.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.4.weight + | -0.003 | -0.029 | 0.030 | 0.017 | torch.Size([120]) || stage5.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage5.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage5.pa_deform.conv_offset.6.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage5.pa_fuse.fc11.weight + | 0.002 | -0.052 | 0.052 | 0.030 | torch.Size([360]) || stage5.pa_fuse.fc11.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage5.pa_fuse.fc12.weight + | 0.003 | -0.053 | 0.052 | 0.032 | torch.Size([360]) || stage5.pa_fuse.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage5.pa_fuse.fc2.weight + | -0.001 | -0.050 | 0.051 | 0.030 | torch.Size([120]) || stage5.pa_fuse.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([30]) || stage6.reshape.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([30]) || stage6.reshape.1.bias + | -0.002 | -0.183 | 0.183 | 0.107 | torch.Size([120, 30]) || stage6.reshape.2.weight + | -0.007 | -0.178 | 0.182 | 0.107 | torch.Size([120]) || stage6.reshape.2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.073 | 0.070 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_self.weight + | 0.003 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.attn.proj.weight + | 0.000 | -0.064 | 0.063 | 0.038 | torch.Size([120]) || stage6.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.002 | -0.089 | 0.091 | 0.052 | 
torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc11.weight + | 0.001 | -0.091 | 0.090 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc12.weight + | -0.005 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.mlp.fc2.weight + | -0.001 | -0.065 | 0.064 | 0.038 | torch.Size([120]) || stage6.residual_group1.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.bias + | 0.000 | -0.068 | 0.071 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_self.weight + | -0.004 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.attn.proj.weight + | -0.005 | -0.064 | 0.061 | 0.037 | torch.Size([120]) || stage6.residual_group1.blocks.1.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc11.weight + | 0.004 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc11.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc12.weight + | 0.004 | -0.091 | 0.090 | 0.048 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.mlp.fc2.weight + | 0.002 | -0.063 | 0.064 | 0.035 | torch.Size([120]) || stage6.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.bias + | -0.000 | -0.065 | 0.067 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || 
stage6.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_self.weight + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.attn.proj.weight + | -0.002 | -0.064 | 0.064 | 0.036 | torch.Size([120]) || stage6.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.004 | -0.090 | 0.091 | 0.052 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc11.weight + | -0.005 | -0.091 | 0.090 | 0.052 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc12.weight + | 0.005 | -0.091 | 0.090 | 0.051 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.mlp.fc2.weight + | 0.002 | -0.062 | 0.064 | 0.035 | torch.Size([120]) || stage6.residual_group1.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.bias + | -0.000 | -0.068 | 0.077 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_self.weight + | 0.004 | -0.090 | 0.091 | 0.050 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.attn.proj.weight + | 0.000 | -0.063 | 0.063 | 0.038 | torch.Size([120]) || stage6.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.002 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc11.weight + | -0.008 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc12.weight + | 0.002 | -0.089 | 0.089 | 0.052 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.mlp.fc2.weight 
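The attention buffers above repeat two shape patterns across all stages: a relative_position_bias_table of shape [675, 6] paired with a [128, 128] relative_position_index (values 0..674, mean 337), and a [2475, 6] table paired with a [384, 384] index (values 0..2474, mean 1237). These sizes are consistent with Swin-style relative position bias over 3D (frame, height, width) attention windows with 6 heads: a (2, 8, 8) window gives (2*2-1)*(2*8-1)*(2*8-1) = 675 table rows and 2*8*8 = 128 tokens, while the full (6, 8, 8) window of each second residual group gives (2*6-1)*15*15 = 2475 rows and 6*8*8 = 384 tokens. The fixed position_bias buffers of shape [1, 64, 120] with values in [-1, 1] look like a sine/cosine positional encoding for the mutual-attention branch. A sketch of the standard index construction follows; it assumes VRT follows the usual Swin Transformer recipe (function and variable names are illustrative).

import torch

def relative_position_index_3d(d: int, h: int, w: int) -> torch.Tensor:
    """Pairwise relative-position ids for a d x h x w attention window."""
    coords = torch.stack(torch.meshgrid(
        torch.arange(d), torch.arange(h), torch.arange(w), indexing='ij'))
    coords = torch.flatten(coords, 1)                 # (3, N) with N = d*h*w
    rel = coords[:, :, None] - coords[:, None, :]     # (3, N, N) pairwise offsets
    rel = rel.permute(1, 2, 0).contiguous()           # (N, N, 3)
    rel[:, :, 0] += d - 1                             # shift each axis to start at 0
    rel[:, :, 1] += h - 1
    rel[:, :, 2] += w - 1
    rel[:, :, 0] *= (2 * h - 1) * (2 * w - 1)         # flatten the 3D offset
    rel[:, :, 1] *= 2 * w - 1                         # into a single table id
    return rel.sum(-1)                                # (N, N), ids in [0, (2d-1)(2h-1)(2w-1)-1]

# relative_position_index_3d(2, 8, 8) has shape (128, 128) and max 674;
# relative_position_index_3d(6, 8, 8) has shape (384, 384) and max 2474,
# matching the buffer statistics logged above.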
+ | 0.005 | -0.063 | 0.064 | 0.037 | torch.Size([120]) || stage6.residual_group1.blocks.3.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.086 | 0.071 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.4.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.attn.proj.weight + | 0.004 | -0.063 | 0.064 | 0.038 | torch.Size([120]) || stage6.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc11.weight + | 0.001 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc12.weight + | 0.008 | -0.088 | 0.091 | 0.055 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.mlp.fc2.weight + | 0.001 | -0.063 | 0.064 | 0.037 | torch.Size([120]) || stage6.residual_group1.blocks.4.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.bias + | 0.000 | -0.074 | 0.065 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_self.weight + | 0.001 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.attn.proj.weight + | 0.001 | -0.065 | 0.063 | 0.039 | torch.Size([120]) || stage6.residual_group1.blocks.5.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.005 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.weight + | 0.000 | 0.000 | 
0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc11.weight + | -0.002 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc11.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc12.weight + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.mlp.fc2.weight + | 0.000 | -0.064 | 0.064 | 0.037 | torch.Size([120]) || stage6.residual_group1.blocks.5.mlp.fc2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage6.linear1.weight + | 0.001 | -0.091 | 0.090 | 0.051 | torch.Size([120]) || stage6.linear1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.bias + | 0.000 | -0.075 | 0.086 | 0.020 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group2.blocks.0.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage6.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage6.residual_group2.blocks.0.attn.proj.weight + | -0.001 | -0.090 | 0.090 | 0.053 | torch.Size([120]) || stage6.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc11.weight + | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc12.weight + | 0.001 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group2.blocks.0.mlp.fc2.weight + | -0.001 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage6.residual_group2.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.079 | 0.081 | 0.020 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group2.blocks.1.attn.qkv_self.weight + | -0.003 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage6.residual_group2.blocks.1.attn.qkv_self.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage6.residual_group2.blocks.1.attn.proj.weight + | 0.005 | -0.089 | 0.090 | 0.054 | torch.Size([120]) || stage6.residual_group2.blocks.1.attn.proj.bias + | 
1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc11.weight + | 0.000 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc12.weight + | 0.000 | -0.090 | 0.090 | 0.054 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group2.blocks.1.mlp.fc2.weight + | -0.004 | -0.063 | 0.064 | 0.038 | torch.Size([120]) || stage6.residual_group2.blocks.1.mlp.fc2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage6.linear2.weight + | -0.004 | -0.091 | 0.091 | 0.051 | torch.Size([120]) || stage6.linear2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.pa_deform.bias + | 0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage6.pa_deform.conv_offset.0.weight + | 0.001 | -0.021 | 0.021 | 0.012 | torch.Size([120]) || stage6.pa_deform.conv_offset.0.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.2.weight + | -0.004 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage6.pa_deform.conv_offset.2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.4.weight + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage6.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage6.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage6.pa_deform.conv_offset.6.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage6.pa_fuse.fc11.weight + | -0.000 | -0.053 | 0.052 | 0.032 | torch.Size([360]) || stage6.pa_fuse.fc11.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage6.pa_fuse.fc12.weight + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360]) || stage6.pa_fuse.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage6.pa_fuse.fc2.weight + | 0.005 | -0.051 | 0.052 | 0.030 | torch.Size([120]) || stage6.pa_fuse.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([30]) || stage7.reshape.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([30]) || stage7.reshape.1.bias + | -0.001 | -0.182 | 0.182 | 0.106 | torch.Size([120, 30]) || stage7.reshape.2.weight + | 0.005 | -0.178 | 0.181 | 0.109 | torch.Size([120]) || stage7.reshape.2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.064 | 0.075 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_self.weight + | -0.004 | -0.091 | 0.090 | 
0.051 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.attn.proj.weight + | 0.002 | -0.063 | 0.064 | 0.040 | torch.Size([120]) || stage7.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.002 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc11.weight + | 0.002 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc12.weight + | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.mlp.fc2.weight + | -0.004 | -0.064 | 0.062 | 0.038 | torch.Size([120]) || stage7.residual_group1.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.bias + | -0.000 | -0.075 | 0.075 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.1.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_self.weight + | 0.002 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.attn.proj.weight + | 0.001 | -0.063 | 0.064 | 0.036 | torch.Size([120]) || stage7.residual_group1.blocks.1.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.005 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc11.weight + | 0.000 | -0.090 | 0.091 | 0.052 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc11.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc12.weight + | -0.003 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.mlp.fc2.weight + | -0.004 | -0.064 | 0.062 | 0.037 | torch.Size([120]) || stage7.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || 
stage7.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.bias + | 0.000 | -0.063 | 0.092 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_self.weight + | -0.004 | -0.090 | 0.091 | 0.053 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.attn.proj.weight + | -0.000 | -0.064 | 0.062 | 0.036 | torch.Size([120]) || stage7.residual_group1.blocks.2.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.000 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc11.weight + | -0.000 | -0.091 | 0.089 | 0.055 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc12.weight + | -0.002 | -0.090 | 0.091 | 0.053 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.mlp.fc2.weight + | 0.000 | -0.064 | 0.064 | 0.036 | torch.Size([120]) || stage7.residual_group1.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.bias + | -0.000 | -0.083 | 0.079 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_self.weight + | 0.001 | -0.091 | 0.090 | 0.051 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.attn.proj.weight + | -0.001 | -0.062 | 0.064 | 0.036 | torch.Size([120]) || stage7.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || 
stage7.residual_group1.blocks.3.mlp.fc11.weight + | -0.002 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc12.weight + | 0.001 | -0.090 | 0.091 | 0.053 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.mlp.fc2.weight + | -0.003 | -0.061 | 0.064 | 0.035 | torch.Size([120]) || stage7.residual_group1.blocks.3.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.bias + | 0.000 | -0.077 | 0.084 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.4.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_self.weight + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.attn.proj.weight + | -0.005 | -0.064 | 0.063 | 0.037 | torch.Size([120]) || stage7.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.000 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc11.weight + | 0.001 | -0.089 | 0.090 | 0.053 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc12.weight + | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.mlp.fc2.weight + | -0.001 | -0.063 | 0.062 | 0.034 | torch.Size([120]) || stage7.residual_group1.blocks.4.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.bias + | 0.000 | -0.071 | 0.078 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_self.weight + | 0.001 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || 
stage7.residual_group1.blocks.5.attn.proj.weight + | 0.004 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage7.residual_group1.blocks.5.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.011 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc11.weight + | -0.003 | -0.091 | 0.090 | 0.050 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc12.weight + | 0.004 | -0.090 | 0.090 | 0.051 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.mlp.fc2.weight + | -0.002 | -0.064 | 0.062 | 0.036 | torch.Size([120]) || stage7.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage7.linear1.weight + | -0.005 | -0.089 | 0.090 | 0.055 | torch.Size([120]) || stage7.linear1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.bias + | 0.000 | -0.077 | 0.074 | 0.020 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage7.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group2.blocks.0.attn.qkv_self.weight + | -0.003 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage7.residual_group2.blocks.0.attn.qkv_self.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage7.residual_group2.blocks.0.attn.proj.weight + | 0.002 | -0.090 | 0.091 | 0.053 | torch.Size([120]) || stage7.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc11.weight + | 0.002 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc12.weight + | 0.002 | -0.091 | 0.090 | 0.051 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group2.blocks.0.mlp.fc2.weight + | 0.002 | -0.060 | 0.062 | 0.036 | torch.Size([120]) || stage7.residual_group2.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.086 | 0.077 | 0.020 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || 
stage7.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group2.blocks.1.attn.qkv_self.weight + | -0.004 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage7.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage7.residual_group2.blocks.1.attn.proj.weight + | 0.000 | -0.089 | 0.089 | 0.053 | torch.Size([120]) || stage7.residual_group2.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc11.weight + | 0.005 | -0.090 | 0.091 | 0.053 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc12.weight + | -0.002 | -0.090 | 0.091 | 0.054 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group2.blocks.1.mlp.fc2.weight + | -0.004 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage7.residual_group2.blocks.1.mlp.fc2.bias + | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([120, 120]) || stage7.linear2.weight + | -0.007 | -0.090 | 0.090 | 0.051 | torch.Size([120]) || stage7.linear2.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.pa_deform.bias + | -0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage7.pa_deform.conv_offset.0.weight + | 0.001 | -0.021 | 0.021 | 0.012 | torch.Size([120]) || stage7.pa_deform.conv_offset.0.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.2.weight + | -0.001 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage7.pa_deform.conv_offset.2.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.4.weight + | 0.001 | -0.030 | 0.028 | 0.017 | torch.Size([120]) || stage7.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage7.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage7.pa_deform.conv_offset.6.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage7.pa_fuse.fc11.weight + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360]) || stage7.pa_fuse.fc11.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage7.pa_fuse.fc12.weight + | 0.000 | -0.053 | 0.052 | 0.031 | torch.Size([360]) || stage7.pa_fuse.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage7.pa_fuse.fc2.weight + | 0.002 | -0.052 | 0.053 | 0.029 | torch.Size([120]) || stage7.pa_fuse.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage8.0.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage8.0.1.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([180, 120]) || stage8.0.2.weight + | 0.005 | -0.090 | 0.090 | 0.050 | torch.Size([180]) || stage8.0.2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.bias + | 0.000 | -0.078 | 0.076 | 
0.020 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.0.attn.qkv_self.weight + | 0.002 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.1.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.0.attn.proj.weight + | 0.003 | -0.074 | 0.074 | 0.042 | torch.Size([180]) || stage8.1.residual_group.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc11.weight + | 0.002 | -0.074 | 0.075 | 0.043 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc12.weight + | 0.001 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.0.mlp.fc2.weight + | -0.003 | -0.052 | 0.052 | 0.030 | torch.Size([180]) || stage8.1.residual_group.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.bias + | -0.000 | -0.078 | 0.075 | 0.020 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.1.attn.qkv_self.weight + | -0.003 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.1.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.1.attn.proj.weight + | 0.003 | -0.073 | 0.074 | 0.045 | torch.Size([180]) || stage8.1.residual_group.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc11.weight + | 0.000 | -0.075 | 0.074 | 0.044 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc12.weight + | 0.001 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.1.mlp.fc2.weight + | 0.001 | -0.052 | 0.052 | 0.033 | torch.Size([180]) || stage8.1.residual_group.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.bias + | -0.000 | -0.081 | 0.076 | 0.020 | 
torch.Size([2475, 6]) || stage8.1.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.2.attn.qkv_self.weight + | -0.002 | -0.074 | 0.074 | 0.042 | torch.Size([540]) || stage8.1.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.2.attn.proj.weight + | 0.002 | -0.074 | 0.074 | 0.044 | torch.Size([180]) || stage8.1.residual_group.blocks.2.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc11.weight + | -0.004 | -0.074 | 0.074 | 0.041 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc12.weight + | -0.004 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.031 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.2.mlp.fc2.weight + | 0.000 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.1.residual_group.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.bias + | 0.000 | -0.084 | 0.092 | 0.020 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.3.attn.qkv_self.weight + | -0.001 | -0.074 | 0.075 | 0.044 | torch.Size([540]) || stage8.1.residual_group.blocks.3.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.3.attn.proj.weight + | -0.003 | -0.074 | 0.074 | 0.042 | torch.Size([180]) || stage8.1.residual_group.blocks.3.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc11.weight + | -0.003 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc12.weight + | -0.002 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.3.mlp.fc2.weight + | 0.003 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.1.residual_group.blocks.3.mlp.fc2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.1.linear.weight + | 0.002 | -0.073 | 0.074 | 0.043 | torch.Size([180]) || stage8.1.linear.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || 
stage8.2.residual_group.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.bias + | -0.000 | -0.077 | 0.071 | 0.020 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.0.attn.qkv_self.weight + | -0.000 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.2.residual_group.blocks.0.attn.qkv_self.bias + | 0.001 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.0.attn.proj.weight + | -0.002 | -0.073 | 0.074 | 0.044 | torch.Size([180]) || stage8.2.residual_group.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc11.weight + | -0.000 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc12.weight + | -0.001 | -0.074 | 0.075 | 0.043 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.0.mlp.fc2.weight + | -0.000 | -0.051 | 0.053 | 0.029 | torch.Size([180]) || stage8.2.residual_group.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.bias + | -0.000 | -0.081 | 0.079 | 0.020 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.1.attn.qkv_self.weight + | -0.001 | -0.074 | 0.074 | 0.042 | torch.Size([540]) || stage8.2.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.1.attn.proj.weight + | 0.004 | -0.073 | 0.074 | 0.043 | torch.Size([180]) || stage8.2.residual_group.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc11.weight + | -0.000 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc12.weight + | 0.000 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.1.mlp.fc2.weight + | 0.002 | -0.052 | 0.052 | 0.030 | torch.Size([180]) || stage8.2.residual_group.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || 
stage8.2.residual_group.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.bias + | -0.000 | -0.081 | 0.071 | 0.020 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.2.attn.qkv_self.weight + | 0.000 | -0.074 | 0.073 | 0.044 | torch.Size([540]) || stage8.2.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.2.attn.proj.weight + | 0.001 | -0.074 | 0.074 | 0.042 | torch.Size([180]) || stage8.2.residual_group.blocks.2.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc11.weight + | -0.000 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc12.weight + | -0.003 | -0.075 | 0.074 | 0.045 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.2.mlp.fc2.weight + | 0.002 | -0.052 | 0.051 | 0.030 | torch.Size([180]) || stage8.2.residual_group.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.bias + | 0.000 | -0.075 | 0.073 | 0.020 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.3.attn.qkv_self.weight + | 0.003 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.2.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.3.attn.proj.weight + | 0.000 | -0.074 | 0.074 | 0.045 | torch.Size([180]) || stage8.2.residual_group.blocks.3.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc11.weight + | -0.001 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc12.weight + | -0.001 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.3.mlp.fc2.weight + | -0.005 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.2.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || 
stage8.2.linear.weight + | 0.000 | -0.074 | 0.073 | 0.044 | torch.Size([180]) || stage8.2.linear.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.bias + | -0.000 | -0.083 | 0.080 | 0.020 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.0.attn.qkv_self.weight + | -0.005 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.3.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.0.attn.proj.weight + | 0.004 | -0.074 | 0.074 | 0.043 | torch.Size([180]) || stage8.3.residual_group.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc11.weight + | -0.003 | -0.073 | 0.074 | 0.042 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc12.weight + | 0.004 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.0.mlp.fc2.weight + | -0.001 | -0.052 | 0.052 | 0.030 | torch.Size([180]) || stage8.3.residual_group.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.bias + | -0.000 | -0.073 | 0.087 | 0.020 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.1.attn.qkv_self.weight + | -0.000 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.3.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.1.attn.proj.weight + | -0.002 | -0.074 | 0.073 | 0.042 | torch.Size([180]) || stage8.3.residual_group.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc11.weight + | -0.001 | -0.075 | 0.075 | 0.043 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc12.weight + | 0.002 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.1.mlp.fc2.weight + | -0.002 | 
-0.052 | 0.052 | 0.030 | torch.Size([180]) || stage8.3.residual_group.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.bias + | 0.000 | -0.085 | 0.080 | 0.020 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.2.attn.qkv_self.weight + | -0.003 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.3.residual_group.blocks.2.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.2.attn.proj.weight + | 0.000 | -0.074 | 0.074 | 0.042 | torch.Size([180]) || stage8.3.residual_group.blocks.2.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc11.weight + | -0.000 | -0.074 | 0.075 | 0.045 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc12.weight + | -0.003 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.2.mlp.fc2.weight + | 0.001 | -0.051 | 0.051 | 0.030 | torch.Size([180]) || stage8.3.residual_group.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.bias + | 0.000 | -0.081 | 0.082 | 0.020 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.3.attn.qkv_self.weight + | -0.000 | -0.075 | 0.074 | 0.044 | torch.Size([540]) || stage8.3.residual_group.blocks.3.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.3.attn.proj.weight + | -0.001 | -0.074 | 0.074 | 0.045 | torch.Size([180]) || stage8.3.residual_group.blocks.3.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc11.weight + | 0.003 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc12.weight + | -0.000 | -0.074 | 0.075 | 0.046 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.3.mlp.fc2.weight + | 0.001 | -0.052 | 0.052 
| 0.030 | torch.Size([180]) || stage8.3.residual_group.blocks.3.mlp.fc2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.3.linear.weight + | -0.001 | -0.073 | 0.074 | 0.042 | torch.Size([180]) || stage8.3.linear.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.bias + | -0.000 | -0.082 | 0.079 | 0.020 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.0.attn.qkv_self.weight + | 0.002 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.4.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.0.attn.proj.weight + | 0.004 | -0.074 | 0.074 | 0.045 | torch.Size([180]) || stage8.4.residual_group.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc11.weight + | -0.001 | -0.074 | 0.074 | 0.041 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc12.weight + | 0.000 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.0.mlp.fc2.weight + | -0.001 | -0.050 | 0.052 | 0.029 | torch.Size([180]) || stage8.4.residual_group.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.bias + | 0.000 | -0.083 | 0.083 | 0.020 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.1.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.1.attn.qkv_self.weight + | -0.003 | -0.074 | 0.073 | 0.043 | torch.Size([540]) || stage8.4.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.1.attn.proj.weight + | 0.005 | -0.073 | 0.072 | 0.041 | torch.Size([180]) || stage8.4.residual_group.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc11.weight + | 0.003 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc12.weight + | 0.001 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || 
stage8.4.residual_group.blocks.1.mlp.fc12.bias
 | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.1.mlp.fc2.weight
 | 0.003 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.4.residual_group.blocks.1.mlp.fc2.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.bias
 | -0.000 | -0.075 | 0.081 | 0.020 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.2.attn.relative_position_bias_table
 | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.2.attn.relative_position_index
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.2.attn.qkv_self.weight
 | -0.000 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.4.residual_group.blocks.2.attn.qkv_self.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.2.attn.proj.weight
 | 0.001 | -0.074 | 0.074 | 0.044 | torch.Size([180]) || stage8.4.residual_group.blocks.2.attn.proj.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc11.weight
 | -0.002 | -0.075 | 0.074 | 0.043 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc11.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc12.weight
 | 0.001 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc12.bias
 | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.2.mlp.fc2.weight
 | 0.002 | -0.053 | 0.052 | 0.031 | torch.Size([180]) || stage8.4.residual_group.blocks.2.mlp.fc2.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.bias
 | -0.000 | -0.083 | 0.072 | 0.020 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.3.attn.relative_position_bias_table
 | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.3.attn.relative_position_index
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.3.attn.qkv_self.weight
 | -0.004 | -0.074 | 0.074 | 0.042 | torch.Size([540]) || stage8.4.residual_group.blocks.3.attn.qkv_self.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.3.attn.proj.weight
 | 0.004 | -0.074 | 0.072 | 0.045 | torch.Size([180]) || stage8.4.residual_group.blocks.3.attn.proj.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc11.weight
 | 0.007 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc11.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc12.weight
 | 0.001 | -0.073 | 0.075 | 0.041 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc12.bias
 | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.3.mlp.fc2.weight
 | -0.002 | -0.052 | 0.053 | 0.031 | torch.Size([180]) || stage8.4.residual_group.blocks.3.mlp.fc2.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.4.linear.weight
 | -0.008 | -0.075 | 0.072 | 0.039 | torch.Size([180]) || stage8.4.linear.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.bias
 | -0.000 | -0.058 | 0.058 | 0.020 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.0.attn.relative_position_bias_table
 | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.0.attn.relative_position_index
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.0.attn.qkv_self.weight
 | 0.001 | -0.073 | 0.075 | 0.042 | torch.Size([540]) || stage8.5.residual_group.blocks.0.attn.qkv_self.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.0.attn.proj.weight
 | 0.001 | -0.074 | 0.074 | 0.044 | torch.Size([180]) || stage8.5.residual_group.blocks.0.attn.proj.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc11.weight
 | -0.001 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc11.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc12.weight
 | -0.000 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc12.bias
 | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.0.mlp.fc2.weight
 | -0.002 | -0.051 | 0.051 | 0.031 | torch.Size([180]) || stage8.5.residual_group.blocks.0.mlp.fc2.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.bias
 | -0.000 | -0.063 | 0.060 | 0.019 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.1.attn.relative_position_bias_table
 | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.1.attn.relative_position_index
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.1.attn.qkv_self.weight
 | 0.001 | -0.074 | 0.074 | 0.042 | torch.Size([540]) || stage8.5.residual_group.blocks.1.attn.qkv_self.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.1.attn.proj.weight
 | 0.001 | -0.074 | 0.074 | 0.042 | torch.Size([180]) || stage8.5.residual_group.blocks.1.attn.proj.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc11.weight
 | 0.001 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc11.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc12.weight
 | 0.001 | -0.072 | 0.073 | 0.041 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc12.bias
 | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.1.mlp.fc2.weight
 | 0.000 | -0.052 | 0.052 | 0.030 | torch.Size([180]) || stage8.5.residual_group.blocks.1.mlp.fc2.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.bias
 | -0.000 | -0.062 | 0.058 | 0.020 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.2.attn.relative_position_bias_table
 | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.2.attn.relative_position_index
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.2.attn.qkv_self.weight
 | -0.000 | -0.075 | 0.074 | 0.044 | torch.Size([540]) || stage8.5.residual_group.blocks.2.attn.qkv_self.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.2.attn.proj.weight
 | -0.001 | -0.073 | 0.074 | 0.042 | torch.Size([180]) || stage8.5.residual_group.blocks.2.attn.proj.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc11.weight
 | 0.005 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc11.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc12.weight
 | -0.000 | -0.074 | 0.073 | 0.043 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc12.bias
 | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.2.mlp.fc2.weight
 | 0.005 | -0.050 | 0.053 | 0.031 | torch.Size([180]) || stage8.5.residual_group.blocks.2.mlp.fc2.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.bias
 | 0.001 | -0.063 | 0.061 | 0.019 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.3.attn.relative_position_bias_table
 | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.3.attn.relative_position_index
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.3.attn.qkv_self.weight
 | -0.004 | -0.074 | 0.075 | 0.042 | torch.Size([540]) || stage8.5.residual_group.blocks.3.attn.qkv_self.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.3.attn.proj.weight
 | 0.004 | -0.074 | 0.074 | 0.040 | torch.Size([180]) || stage8.5.residual_group.blocks.3.attn.proj.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc11.weight
 | 0.001 | -0.075 | 0.074 | 0.042 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc11.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc12.weight
 | -0.001 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc12.bias
 | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.3.mlp.fc2.weight
 | 0.003 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.5.residual_group.blocks.3.mlp.fc2.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.5.linear.weight
 | -0.001 | -0.074 | 0.074 | 0.042 | torch.Size([180]) || stage8.5.linear.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.bias
 | -0.000 | -0.064 | 0.077 | 0.020 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.0.attn.relative_position_bias_table
 | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.0.attn.relative_position_index
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.0.attn.qkv_self.weight
 | -0.001 | -0.075 | 0.074 | 0.043 | torch.Size([540]) || stage8.6.residual_group.blocks.0.attn.qkv_self.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.0.attn.proj.weight
 | 0.002 | -0.073 | 0.074 | 0.043 | torch.Size([180]) || stage8.6.residual_group.blocks.0.attn.proj.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc11.weight
 | -0.002 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc11.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc12.weight
 | -0.002 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc12.bias
 | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.0.mlp.fc2.weight
 | 0.002 | -0.051 | 0.052 | 0.032 | torch.Size([180]) || stage8.6.residual_group.blocks.0.mlp.fc2.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.bias
 | 0.000 | -0.074 | 0.067 | 0.020 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.1.attn.relative_position_bias_table
 | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.1.attn.relative_position_index
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.1.attn.qkv_self.weight
 | -0.000 | -0.074 | 0.074 | 0.041 | torch.Size([540]) || stage8.6.residual_group.blocks.1.attn.qkv_self.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.1.attn.proj.weight
 | -0.000 | -0.074 | 0.074 | 0.045 | torch.Size([180]) || stage8.6.residual_group.blocks.1.attn.proj.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc11.weight
 | -0.001 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc11.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc12.weight
 | 0.002 | -0.075 | 0.074 | 0.042 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc12.bias
 | -0.000 | -0.053 | 0.053 | 0.031 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.1.mlp.fc2.weight
 | -0.001 | -0.052 | 0.053 | 0.031 | torch.Size([180]) || stage8.6.residual_group.blocks.1.mlp.fc2.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.bias
 | 0.001 | -0.071 | 0.075 | 0.020 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.2.attn.relative_position_bias_table
 | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.2.attn.relative_position_index
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.2.attn.qkv_self.weight
 | 0.002 | -0.075 | 0.074 | 0.044 | torch.Size([540]) || stage8.6.residual_group.blocks.2.attn.qkv_self.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.2.attn.proj.weight
 | 0.002 | -0.073 | 0.074 | 0.043 | torch.Size([180]) || stage8.6.residual_group.blocks.2.attn.proj.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc11.weight
 | 0.004 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc11.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc12.weight
 | -0.004 | -0.074 | 0.074 | 0.041 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc12.bias
 | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.2.mlp.fc2.weight
 | -0.003 | -0.052 | 0.052 | 0.030 | torch.Size([180]) || stage8.6.residual_group.blocks.2.mlp.fc2.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.bias
 | -0.000 | -0.060 | 0.066 | 0.021 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.3.attn.relative_position_bias_table
 | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.3.attn.relative_position_index
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.3.attn.qkv_self.weight
 | -0.002 | -0.074 | 0.074 | 0.042 | torch.Size([540]) || stage8.6.residual_group.blocks.3.attn.qkv_self.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.3.attn.proj.weight
 | -0.002 | -0.074 | 0.074 | 0.044 | torch.Size([180]) || stage8.6.residual_group.blocks.3.attn.proj.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc11.weight
 | 0.003 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc11.bias
 | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc12.weight
 | -0.001 | -0.074 | 0.075 | 0.044 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc12.bias
 | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.3.mlp.fc2.weight
 | 0.001 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.6.residual_group.blocks.3.mlp.fc2.bias
 | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.6.linear.weight
 | -0.009 | -0.074 | 0.074 | 0.043 | torch.Size([180]) || stage8.6.linear.bias
 | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || norm.weight
 | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || norm.bias
 | -0.001 | -0.075 | 0.075 | 0.043 | torch.Size([120, 180]) || conv_after_body.weight
 | -0.002 | -0.074 | 0.074 | 0.044 | torch.Size([120]) || conv_after_body.bias
 | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([64, 120, 1, 3, 3]) || conv_before_upsample.0.weight
 | 0.000 | -0.029 | 0.030 | 0.016 | torch.Size([64]) || conv_before_upsample.0.bias
 | -0.000 | -0.042 | 0.042 | 0.024 | torch.Size([256, 64, 1, 3, 3]) || upsample.0.weight
 | 0.000 | -0.041 | 0.042 | 0.024 | torch.Size([256]) || upsample.0.bias
 | -0.000 | -0.042 | 0.042 | 0.024 | torch.Size([256, 64, 1, 3, 3]) || upsample.5.weight
 | 0.000 | -0.041 | 0.040 | 0.025 | torch.Size([256]) || upsample.5.bias
 | 0.000 | -0.042 | 0.042 | 0.024 | torch.Size([64, 64, 1, 3, 3]) || upsample.10.weight
 | 0.003 | -0.041 | 0.041 | 0.025 | torch.Size([64]) || upsample.10.bias
 | -0.000 | -0.042 | 0.042 | 0.024 | torch.Size([3, 64, 1, 3, 3]) || conv_last.weight
 | 0.001 | -0.039 | 0.037 | 0.038 | torch.Size([3]) || conv_last.bias
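[Editor's note] The five numeric columns in the table above are, in order, the mean, min, max, and standard deviation of each tensor, followed by its shape and the parameter/buffer name; integer-valued rows such as relative_position_index are registered buffers, not trained weights. As a reading aid, here is a minimal sketch of a helper that prints any module in this format. It is an illustrative stand-in written for this note, not the repository's own logging code:

import torch

def describe_params(model: torch.nn.Module) -> str:
    # One row per entry in state_dict(), so buffers such as
    # relative_position_index appear alongside trainable weights.
    rows = []
    for name, t in model.state_dict().items():
        t = t.float()  # integer buffers have no mean/std in integer dtype
        rows.append(' | {:.3f} | {:.3f} | {:.3f} | {:.3f} | {} || {}'.format(
            t.mean().item(), t.min().item(), t.max().item(),
            t.std().item(), t.shape, name))  # t.shape prints as torch.Size([...])
    return '\n'.join(rows)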
22-03-11 09:55:18.025 : task: 001_train_vrt_videosr_bi_reds_6frames
  model: vrt
  gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7]
  dist: False
  find_unused_parameters: False
  use_static_graph: True
  scale: 4
  n_channels: 3
  path:[
    root: experiments
    pretrained_netG: None
    pretrained_netE: None
    task: experiments/001_train_vrt_videosr_bi_reds_6frames
    log: experiments/001_train_vrt_videosr_bi_reds_6frames
    options: experiments/001_train_vrt_videosr_bi_reds_6frames/options
    models: experiments/001_train_vrt_videosr_bi_reds_6frames/models
    images: experiments/001_train_vrt_videosr_bi_reds_6frames/images
    pretrained_optimizerG: None
  ]
  datasets:[
    train:[
      name: train_dataset
      dataset_type: VideoRecurrentTrainDataset
      dataroot_gt: trainsets/REDS/train_sharp_with_val.lmdb
      dataroot_lq: trainsets/REDS/train_sharp_bicubic_with_val.lmdb
      meta_info_file: data/meta_info/meta_info_REDS_GT.txt
      filename_tmpl: 08d
      filename_ext: png
      val_partition: REDS4
      test_mode: False
      io_backend:[
        type: lmdb
      ]
      num_frame: 6
      gt_size: 256
      interval_list: [1]
      random_reverse: False
      use_hflip: True
      use_rot: True
      dataloader_shuffle: True
      dataloader_num_workers: 32
      dataloader_batch_size: 8
      phase: train
      scale: 4
      n_channels: 3
    ]
    test:[
      name: test_dataset
      dataset_type: VideoRecurrentTestDataset
      dataroot_gt: testsets/REDS4/GT
      dataroot_lq: testsets/REDS4/sharp_bicubic
      cache_data: True
      io_backend:[
        type: disk
      ]
      num_frame: -1
      phase: test
      scale: 4
      n_channels: 3
    ]
  ]
  netG:[
    net_type: vrt
    upscale: 4
    img_size: [6, 64, 64]
    window_size: [6, 8, 8]
    depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4]
    indep_reconsts: [11, 12]
    embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180]
    num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
    spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth
    pa_frames: 2
    deformable_groups: 12
    nonblind_denoising: False
    use_checkpoint_attn: False
    use_checkpoint_ffn: False
    no_checkpoint_attn_blocks: []
    no_checkpoint_ffn_blocks: []
    init_type: default
    scale: 4
  ]
  train:[
    G_lossfn_type: charbonnier
    G_lossfn_weight: 1.0
    G_charbonnier_eps: 1e-09
    E_decay: 0
    G_optimizer_type: adam
    G_optimizer_lr: 0.0004
    G_optimizer_betas: [0.9, 0.99]
    G_optimizer_wd: 0
    G_optimizer_clipgrad: None
    G_optimizer_reuse: True
    fix_iter: 20000
    fix_lr_mul: 0.125
    fix_keys: ['spynet', 'deform']
    total_iter: 300000
    G_scheduler_type: CosineAnnealingWarmRestarts
    G_scheduler_periods: 300000
    G_scheduler_eta_min: 1e-07
    G_regularizer_orthstep: None
    G_regularizer_clipstep: None
    G_param_strict: True
    E_param_strict: True
    checkpoint_test: 5000
    checkpoint_save: 5000
    checkpoint_print: 200
    F_feature_layer: 34
    F_weights: 1.0
    F_lossfn_type: l1
    F_use_input_norm: True
    F_use_range_norm: False
    G_scheduler_restart_weights: 1
  ]
  val:[
    save_img: False
    pad_seq: False
    flip_seq: False
    center_frame_only: False
    num_frame_testing: 40
    num_frame_overlapping: 2
    size_patch_testing: 128
  ]
  opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json
  is_train: True
  merge_bn: False
  merge_bn_startpoint: -1
  num_gpu: 8
  rank: 0
  world_size: 1

22-03-11 09:55:18.071 : Number of train images: 27,000, iters: 3,375
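[Editor's note] With 27,000 training images and dataloader_batch_size 8, one pass over the training set is 27,000 / 8 = 3,375 iterations, the figure logged above. In the network dump that follows, every TMSA block pairs window attention with an Mlp_GEGLU feed-forward whose fc11/fc12 pair forms a gated GELU (GEGLU) unit. A minimal sketch of that gating idea, as the printed fc11/fc12/act/fc2 layout suggests (an illustrative re-implementation, not the repository's code):

import torch.nn as nn

class MlpGEGLU(nn.Module):
    # Gated-GELU MLP: the value branch (fc11) is passed through GELU and
    # multiplied elementwise with a linear gate branch (fc12), then
    # projected back down by fc2 -- the fc11/fc12/act/fc2 layout below.
    def __init__(self, dim: int, hidden: int, drop: float = 0.0):
        super().__init__()
        self.fc11 = nn.Linear(dim, hidden)
        self.fc12 = nn.Linear(dim, hidden)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden, dim)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        return self.drop(self.fc2(self.act(self.fc11(x)) * self.fc12(x)))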
22-03-11 09:55:21.359 :
Networks name: VRT
Params number: 30676435
Net structure:
VRT(
  (conv_first): Conv3d(27, 120, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1))
  (spynet): SpyNet(
    (basic_module): ModuleList(
      (0-5): 6 x BasicModule(
        (basic_module): Sequential(
          (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
          (1): ReLU()
          (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
          (3): ReLU()
          (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
          (5): ReLU()
          (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
          (7): ReLU()
          (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
        )
      )
    )
  )
  (stage1): Stage(
    (reshape): Sequential(
      (0): Rearrange('n c d h w -> n d h w c')
      (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
      (2): Rearrange('n d h w c -> n c d h w')
    )
    (residual_group1): TMSAG(
      (blocks): ModuleList(
        (0): TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=240, out_features=120, bias=True)
            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): Identity()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
        (1-5): 5 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=240, out_features=120, bias=True)
            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear1): Linear(in_features=120, out_features=120, bias=True)
    (residual_group2): TMSAG(
      (blocks): ModuleList(
        (0): TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=120, out_features=120, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): Identity()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
        (1): TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=120, out_features=120, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear2): Linear(in_features=120, out_features=120, bias=True)
    (pa_deform): DCNv2PackFlowGuided(
      (conv_offset): Sequential(
        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LeakyReLU(negative_slope=0.1, inplace=True)
        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): LeakyReLU(negative_slope=0.1, inplace=True)
        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (5): LeakyReLU(negative_slope=0.1, inplace=True)
        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (pa_fuse): Mlp_GEGLU(
      (fc11): Linear(in_features=360, out_features=360, bias=True)
      (fc12): Linear(in_features=360, out_features=360, bias=True)
      (act): GELU()
      (fc2): Linear(in_features=360, out_features=120, bias=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
  )
  (stage2): Stage(
    (reshape): Sequential(
      (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2)
      (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True)
      (2): Linear(in_features=480, out_features=120, bias=True)
      (3): Rearrange('n d h w c -> n c d h w')
    )
    (residual_group1): TMSAG(
      (blocks): ModuleList(
        (0-5): 6 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=240, out_features=120, bias=True)
            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear1): Linear(in_features=120, out_features=120, bias=True)
    (residual_group2): TMSAG(
      (blocks): ModuleList(
        (0-1): 2 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=120, out_features=120, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear2): Linear(in_features=120, out_features=120, bias=True)
    (pa_deform): DCNv2PackFlowGuided(
      (conv_offset): Sequential(
        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LeakyReLU(negative_slope=0.1, inplace=True)
        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): LeakyReLU(negative_slope=0.1, inplace=True)
        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (5): LeakyReLU(negative_slope=0.1, inplace=True)
        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (pa_fuse): Mlp_GEGLU(
      (fc11): Linear(in_features=360, out_features=360, bias=True)
      (fc12): Linear(in_features=360, out_features=360, bias=True)
      (act): GELU()
      (fc2): Linear(in_features=360, out_features=120, bias=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
  )
  (stage3): Stage(
    (reshape): Sequential(
      (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2)
      (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True)
      (2): Linear(in_features=480, out_features=120, bias=True)
      (3): Rearrange('n d h w c -> n c d h w')
    )
    (residual_group1): TMSAG(
      (blocks): ModuleList(
        (0-5): 6 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=240, out_features=120, bias=True)
            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear1): Linear(in_features=120, out_features=120, bias=True)
    (residual_group2): TMSAG(
      (blocks): ModuleList(
        (0-1): 2 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=120, out_features=120, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear2): Linear(in_features=120, out_features=120, bias=True)
    (pa_deform): DCNv2PackFlowGuided(
      (conv_offset): Sequential(
        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LeakyReLU(negative_slope=0.1, inplace=True)
        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): LeakyReLU(negative_slope=0.1, inplace=True)
        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (5): LeakyReLU(negative_slope=0.1, inplace=True)
        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (pa_fuse): Mlp_GEGLU(
      (fc11): Linear(in_features=360, out_features=360, bias=True)
      (fc12): Linear(in_features=360, out_features=360, bias=True)
      (act): GELU()
      (fc2): Linear(in_features=360, out_features=120, bias=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
  )
  (stage4): Stage(
    (reshape): Sequential(
      (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2)
      (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True)
      (2): Linear(in_features=480, out_features=120, bias=True)
      (3): Rearrange('n d h w c -> n c d h w')
    )
    (residual_group1): TMSAG(
      (blocks): ModuleList(
        (0-5): 6 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=240, out_features=120, bias=True)
            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear1): Linear(in_features=120, out_features=120, bias=True)
    (residual_group2): TMSAG(
      (blocks): ModuleList(
        (0-1): 2 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=120, out_features=120, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear2): Linear(in_features=120, out_features=120, bias=True)
    (pa_deform): DCNv2PackFlowGuided(
      (conv_offset): Sequential(
        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LeakyReLU(negative_slope=0.1, inplace=True)
        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): LeakyReLU(negative_slope=0.1, inplace=True)
        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (5): LeakyReLU(negative_slope=0.1, inplace=True)
        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (pa_fuse): Mlp_GEGLU(
      (fc11): Linear(in_features=360, out_features=360, bias=True)
      (fc12): Linear(in_features=360, out_features=360, bias=True)
      (act): GELU()
      (fc2): Linear(in_features=360, out_features=120, bias=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
  )
  (stage5): Stage(
    (reshape): Sequential(
      (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2)
      (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True)
      (2): Linear(in_features=30, out_features=120, bias=True)
      (3): Rearrange('n d h w c -> n c d h w')
    )
    (residual_group1): TMSAG(
      (blocks): ModuleList(
        (0-5): 6 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=240, out_features=120, bias=True)
            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear1): Linear(in_features=120, out_features=120, bias=True)
    (residual_group2): TMSAG(
      (blocks): ModuleList(
        (0-1): 2 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=120, out_features=120, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear2): Linear(in_features=120, out_features=120, bias=True)
    (pa_deform): DCNv2PackFlowGuided(
      (conv_offset): Sequential(
        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LeakyReLU(negative_slope=0.1, inplace=True)
        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): LeakyReLU(negative_slope=0.1, inplace=True)
        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (5): LeakyReLU(negative_slope=0.1, inplace=True)
        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (pa_fuse): Mlp_GEGLU(
      (fc11): Linear(in_features=360, out_features=360, bias=True)
      (fc12): Linear(in_features=360, out_features=360, bias=True)
      (act): GELU()
      (fc2): Linear(in_features=360, out_features=120, bias=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
  )
  (stage6): Stage(
    (reshape): Sequential(
      (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2)
      (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True)
      (2): Linear(in_features=30, out_features=120, bias=True)
      (3): Rearrange('n d h w c -> n c d h w')
    )
    (residual_group1): TMSAG(
      (blocks): ModuleList(
        (0-5): 6 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=240, out_features=120, bias=True)
            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear1): Linear(in_features=120, out_features=120, bias=True)
    (residual_group2): TMSAG(
      (blocks): ModuleList(
        (0-1): 2 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=120, out_features=120, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear2): Linear(in_features=120, out_features=120, bias=True)
    (pa_deform): DCNv2PackFlowGuided(
      (conv_offset): Sequential(
        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LeakyReLU(negative_slope=0.1, inplace=True)
        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): LeakyReLU(negative_slope=0.1, inplace=True)
        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (5): LeakyReLU(negative_slope=0.1, inplace=True)
        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (pa_fuse): Mlp_GEGLU(
      (fc11): Linear(in_features=360, out_features=360, bias=True)
      (fc12): Linear(in_features=360, out_features=360, bias=True)
      (act): GELU()
      (fc2): Linear(in_features=360, out_features=120, bias=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
  )
  (stage7): Stage(
    (reshape): Sequential(
      (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2)
      (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True)
      (2): Linear(in_features=30, out_features=120, bias=True)
      (3): Rearrange('n d h w c -> n c d h w')
    )
    (residual_group1): TMSAG(
      (blocks): ModuleList(
        (0-5): 6 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=240, out_features=120, bias=True)
            (qkv_mut): Linear(in_features=120, out_features=360, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear1): Linear(in_features=120, out_features=120, bias=True)
    (residual_group2): TMSAG(
      (blocks): ModuleList(
        (0-1): 2 x TMSA(
          (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv_self): Linear(in_features=120, out_features=360, bias=True)
            (proj): Linear(in_features=120, out_features=120, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (drop_path): DropPath()
          (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp_GEGLU(
            (fc11): Linear(in_features=120, out_features=240, bias=True)
            (fc12): Linear(in_features=120, out_features=240, bias=True)
            (act): GELU()
            (fc2): Linear(in_features=240, out_features=120, bias=True)
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (linear2): Linear(in_features=120, out_features=120, bias=True)
    (pa_deform): DCNv2PackFlowGuided(
      (conv_offset): Sequential(
        (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LeakyReLU(negative_slope=0.1, inplace=True)
        (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): LeakyReLU(negative_slope=0.1, inplace=True)
        (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (5): LeakyReLU(negative_slope=0.1, inplace=True)
        (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (pa_fuse): Mlp_GEGLU(
      (fc11): Linear(in_features=360, out_features=360, bias=True)
      (fc12): Linear(in_features=360, out_features=360, bias=True)
      (act): GELU()
      (fc2): Linear(in_features=360, out_features=120, bias=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
  )
  (stage8): ModuleList(
    (0): Sequential(
      (0): Rearrange('n c d h w -> n d h w c')
      (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
      (2): Linear(in_features=120, out_features=180, bias=True)
      (3): Rearrange('n d h w c -> n c d h w')
    )
    (1): RTMSA(
      (residual_group): TMSAG(
        (blocks): ModuleList(
          (0): TMSA(
            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
            (attn): WindowAttention(
              (qkv_self): Linear(in_features=180, out_features=540, bias=True)
              (proj): Linear(in_features=180, out_features=180, bias=True)
              (softmax): Softmax(dim=-1)
            )
            (drop_path): DropPath()
            (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
            (mlp): Mlp_GEGLU(
              (fc11): Linear(in_features=180, out_features=360, bias=True)
              (fc12): Linear(in_features=180, out_features=360, bias=True)
              (act): GELU()
              (fc2): Linear(in_features=360, out_features=180, bias=True)
              (drop): Dropout(p=0.0, inplace=False)
            )
          )
          (1): TMSA(
            (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True)
            (attn): WindowAttention(
              (qkv_self): Linear(in_features=180,
out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (2): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): 
Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (3): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + 
(linear): Linear(in_features=180, out_features=180, bias=True) + ) + (4): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (5): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): 
Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (6): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): 
Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + ) + (norm): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (conv_after_body): Linear(in_features=180, out_features=120, bias=True) + (conv_before_upsample): Sequential( + (0): Conv3d(120, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): LeakyReLU(negative_slope=0.01, inplace=True) + ) + (upsample): Upsample( + (0): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): Transpose_Dim12() + (2): PixelShuffle(upscale_factor=2) + (3): Transpose_Dim12() + (4): LeakyReLU(negative_slope=0.1, inplace=True) + (5): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (6): Transpose_Dim12() + (7): PixelShuffle(upscale_factor=2) + (8): Transpose_Dim12() + (9): LeakyReLU(negative_slope=0.1, inplace=True) + (10): Conv3d(64, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + ) + (conv_last): Conv3d(64, 3, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) +) + +22-03-11 09:55:21.536 : + | mean | min | max | std || shape + | 0.000 | -0.064 | 0.064 | 0.037 | torch.Size([120, 27, 1, 3, 3]) || conv_first.weight + | 0.000 | -0.062 | 0.064 | 0.037 | torch.Size([120]) || conv_first.bias + | 0.449 | 0.406 | 0.485 | 0.040 | torch.Size([1, 3, 1, 1]) || spynet.mean + | 0.226 | 0.224 | 0.229 | 0.003 | torch.Size([1, 3, 1, 1]) || spynet.std + | -0.000 | -0.684 | 0.720 | 0.066 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.0.basic_module.0.weight + | -0.055 | -0.917 | 0.306 | 0.335 | torch.Size([32]) || spynet.basic_module.0.basic_module.0.bias + | -0.009 | -3.201 | 0.948 | 0.096 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.0.basic_module.2.weight + | 0.039 | -1.273 | 0.675 | 0.311 | torch.Size([64]) || spynet.basic_module.0.basic_module.2.bias + | -0.010 | -4.690 | 0.568 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.0.basic_module.4.weight + | 0.162 | -0.704 | 0.905 | 0.366 | torch.Size([32]) || spynet.basic_module.0.basic_module.4.bias + | -0.023 | -1.714 | 0.414 | 0.091 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.0.basic_module.6.weight + | 0.787 | -1.061 | 1.170 | 0.522 | torch.Size([16]) || spynet.basic_module.0.basic_module.6.bias + | 0.000 | -0.145 | 0.166 | 0.018 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.0.basic_module.8.weight + | -0.000 | -0.001 | 0.000 | 0.001 | torch.Size([2]) || spynet.basic_module.0.basic_module.8.bias + | -0.000 | -0.726 | 0.782 | 0.070 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.1.basic_module.0.weight + | -0.024 | -0.810 | 0.352 | 0.313 | torch.Size([32]) || 
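The structure dump above fixes every shape in the 120-dim TMSA blocks: qkv_self and qkv_mut each map 120 -> 360 (a packed q, k, v), while proj maps 240 -> 120, consistent with the self- and mutual-attention outputs (120 + 120) being concatenated channel-wise before projection. The MLP is a GEGLU variant: two parallel 120 -> 240 branches gated against each other, then 240 -> 120. Below is a minimal sketch that reproduces these shapes, assuming the standard GEGLU formulation fc2(GELU(fc11(x)) * fc12(x)); it is an illustration, not the KAIR source.

import torch
import torch.nn as nn

class MlpGEGLU(nn.Module):
    # Mirrors the printed Mlp_GEGLU: fc11/fc12 (120 -> 240), GELU, fc2 (240 -> 120).
    def __init__(self, dim=120, hidden=240):
        super().__init__()
        self.fc11 = nn.Linear(dim, hidden)  # gate branch
        self.fc12 = nn.Linear(dim, hidden)  # value branch
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden, dim)
        self.drop = nn.Dropout(0.0)

    def forward(self, x):
        # GEGLU: activated gate multiplied element-wise with the value branch
        return self.drop(self.fc2(self.act(self.fc11(x)) * self.fc12(x)))

dim = 120
qkv_self = nn.Linear(dim, 3 * dim)  # 120 -> 360: packed q, k, v (self-attention)
qkv_mut = nn.Linear(dim, 3 * dim)   # 120 -> 360: packed q, k, v (mutual attention)
proj = nn.Linear(2 * dim, dim)      # 240 -> 120: fuse the two attention outputs

x = torch.randn(8, dim)                          # 8 tokens of width 120
v_self = qkv_self(x).chunk(3, dim=-1)[2]         # stand-ins for attended values;
v_mut = qkv_mut(x).chunk(3, dim=-1)[2]           # the real windowed attention is omitted
out = proj(torch.cat([v_self, v_mut], dim=-1))   # (8, 120)
print(MlpGEGLU()(out).shape)                     # torch.Size([8, 120])

Note that the residual_group2 blocks print no qkv_mut and a 120-in proj: only self-attention, so there is nothing to concatenate.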
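The tail of the dump also shows how x4 upsampling is applied to 5-D video tensors: a Conv3d packs 2x2 sub-pixel channels (64 -> 256), Transpose_Dim12 swaps the channel and temporal axes so PixelShuffle sees per-frame maps in its last three dims, and the Conv3d/shuffle pair is repeated for the overall x4. A short sketch of one x2 step, assuming PixelShuffle's support for extra leading batch dims and modeling Transpose_Dim12 with a plain transpose:

import torch
import torch.nn as nn

x = torch.randn(1, 64, 6, 32, 32)  # (N, C, D, H, W): 6-frame feature video
conv = nn.Conv3d(64, 256, kernel_size=(1, 3, 3), padding=(0, 1, 1))
shuffle = nn.PixelShuffle(2)

y = conv(x)            # (1, 256, 6, 32, 32): 64 * 2 * 2 sub-pixel channels
y = y.transpose(1, 2)  # (1, 6, 256, 32, 32): what Transpose_Dim12 does
y = shuffle(y)         # (1, 6, 64, 64, 64): 2x in H and W, per frame
y = y.transpose(1, 2)  # (1, 64, 6, 64, 64): back to (N, C, D, H, W)
print(y.shape)         # the dump above applies this block twice for x4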
spynet.basic_module.1.basic_module.0.bias + | -0.008 | -3.370 | 0.914 | 0.098 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.1.basic_module.2.weight + | 0.042 | -1.197 | 0.699 | 0.302 | torch.Size([64]) || spynet.basic_module.1.basic_module.2.bias + | -0.008 | -4.468 | 0.566 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.1.basic_module.4.weight + | 0.160 | -0.745 | 0.996 | 0.391 | torch.Size([32]) || spynet.basic_module.1.basic_module.4.bias + | -0.017 | -1.648 | 0.317 | 0.084 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.1.basic_module.6.weight + | 0.785 | -1.176 | 1.158 | 0.543 | torch.Size([16]) || spynet.basic_module.1.basic_module.6.bias + | 0.000 | -0.145 | 0.163 | 0.014 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.1.basic_module.8.weight + | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([2]) || spynet.basic_module.1.basic_module.8.bias + | 0.000 | -1.003 | 0.875 | 0.089 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.2.basic_module.0.weight + | -0.021 | -0.979 | 0.466 | 0.373 | torch.Size([32]) || spynet.basic_module.2.basic_module.0.bias + | -0.008 | -4.622 | 1.220 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.2.basic_module.2.weight + | 0.028 | -1.276 | 0.717 | 0.308 | torch.Size([64]) || spynet.basic_module.2.basic_module.2.bias + | -0.007 | -1.827 | 0.624 | 0.092 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.2.basic_module.4.weight + | 0.123 | -0.697 | 0.745 | 0.334 | torch.Size([32]) || spynet.basic_module.2.basic_module.4.bias + | -0.010 | -1.295 | 0.330 | 0.068 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.2.basic_module.6.weight + | 0.677 | -1.696 | 0.934 | 0.637 | torch.Size([16]) || spynet.basic_module.2.basic_module.6.bias + | 0.000 | -0.114 | 0.129 | 0.008 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.2.basic_module.8.weight + | -0.003 | -0.008 | 0.002 | 0.007 | torch.Size([2]) || spynet.basic_module.2.basic_module.8.bias + | 0.000 | -1.053 | 0.952 | 0.091 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.3.basic_module.0.weight + | -0.016 | -1.061 | 0.522 | 0.414 | torch.Size([32]) || spynet.basic_module.3.basic_module.0.bias + | -0.008 | -4.891 | 1.222 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.3.basic_module.2.weight + | 0.029 | -1.264 | 0.760 | 0.309 | torch.Size([64]) || spynet.basic_module.3.basic_module.2.bias + | -0.007 | -1.792 | 0.579 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.3.basic_module.4.weight + | 0.117 | -0.694 | 0.670 | 0.329 | torch.Size([32]) || spynet.basic_module.3.basic_module.4.bias + | -0.008 | -1.108 | 0.324 | 0.065 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.3.basic_module.6.weight + | 0.652 | -1.754 | 0.901 | 0.647 | torch.Size([16]) || spynet.basic_module.3.basic_module.6.bias + | 0.000 | -0.117 | 0.129 | 0.008 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.3.basic_module.8.weight + | 0.002 | -0.003 | 0.007 | 0.007 | torch.Size([2]) || spynet.basic_module.3.basic_module.8.bias + | -0.000 | -1.085 | 0.998 | 0.092 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.4.basic_module.0.weight + | 0.009 | -0.975 | 0.477 | 0.368 | torch.Size([32]) || spynet.basic_module.4.basic_module.0.bias + | -0.008 | -5.056 | 1.282 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.4.basic_module.2.weight + | 0.029 | -1.240 | 0.796 | 0.311 | torch.Size([64]) || spynet.basic_module.4.basic_module.2.bias + | -0.007 | -1.772 | 0.600 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.4.basic_module.4.weight + | 0.121 | -0.688 | 0.694 | 0.331 
| torch.Size([32]) || spynet.basic_module.4.basic_module.4.bias + | -0.007 | -0.980 | 0.320 | 0.065 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.4.basic_module.6.weight + | 0.642 | -1.810 | 0.912 | 0.662 | torch.Size([16]) || spynet.basic_module.4.basic_module.6.bias + | 0.000 | -0.188 | 0.209 | 0.011 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.4.basic_module.8.weight + | -0.002 | -0.008 | 0.005 | 0.009 | torch.Size([2]) || spynet.basic_module.4.basic_module.8.bias + | -0.000 | -1.085 | 0.999 | 0.092 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.5.basic_module.0.weight + | 0.009 | -0.982 | 0.474 | 0.368 | torch.Size([32]) || spynet.basic_module.5.basic_module.0.bias + | -0.008 | -5.089 | 1.311 | 0.119 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.5.basic_module.2.weight + | 0.029 | -1.256 | 0.804 | 0.314 | torch.Size([64]) || spynet.basic_module.5.basic_module.2.bias + | -0.008 | -1.788 | 0.613 | 0.093 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.5.basic_module.4.weight + | 0.122 | -0.699 | 0.700 | 0.334 | torch.Size([32]) || spynet.basic_module.5.basic_module.4.bias + | -0.008 | -1.010 | 0.323 | 0.067 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.5.basic_module.6.weight + | 0.650 | -1.834 | 0.923 | 0.670 | torch.Size([16]) || spynet.basic_module.5.basic_module.6.bias + | 0.000 | -0.192 | 0.213 | 0.011 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.5.basic_module.8.weight + | -0.001 | -0.007 | 0.005 | 0.009 | torch.Size([2]) || spynet.basic_module.5.basic_module.8.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.reshape.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.reshape.1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.069 | 0.063 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_self.weight + | 0.001 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.attn.proj.weight + | -0.001 | -0.063 | 0.065 | 0.035 | torch.Size([120]) || stage1.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc11.weight + | 0.003 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc12.weight + | 0.003 | -0.090 | 0.091 | 0.054 | torch.Size([240]) || 
stage1.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.mlp.fc2.weight + | 0.004 | -0.064 | 0.064 | 0.040 | torch.Size([120]) || stage1.residual_group1.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.bias + | 0.000 | -0.066 | 0.076 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_self.weight + | 0.002 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.attn.proj.weight + | 0.001 | -0.065 | 0.064 | 0.037 | torch.Size([120]) || stage1.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.002 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc11.weight + | -0.005 | -0.091 | 0.091 | 0.055 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc12.weight + | 0.002 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.mlp.fc2.weight + | -0.003 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage1.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.bias + | -0.001 | -0.074 | 0.067 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.2.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_self.weight + | -0.002 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.064 | 0.064 | 0.040 | torch.Size([120]) || stage1.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.003 | -0.091 | 0.090 | 0.053 | torch.Size([360]) || 
stage1.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc11.weight + | -0.004 | -0.090 | 0.091 | 0.051 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc12.weight + | 0.008 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.mlp.fc2.weight + | 0.000 | -0.063 | 0.062 | 0.034 | torch.Size([120]) || stage1.residual_group1.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.bias + | 0.000 | -0.068 | 0.072 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_self.weight + | 0.003 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.attn.proj.weight + | -0.005 | -0.060 | 0.063 | 0.037 | torch.Size([120]) || stage1.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.000 | -0.090 | 0.091 | 0.052 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc11.weight + | 0.004 | -0.089 | 0.091 | 0.053 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc12.weight + | 0.001 | -0.090 | 0.091 | 0.055 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.mlp.fc2.weight + | -0.002 | -0.062 | 0.063 | 0.034 | torch.Size([120]) || stage1.residual_group1.blocks.3.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.080 | 0.073 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || 
stage1.residual_group1.blocks.4.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_self.weight + | 0.000 | -0.090 | 0.091 | 0.054 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.attn.proj.weight + | 0.002 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage1.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.002 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc11.weight + | -0.007 | -0.090 | 0.089 | 0.048 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc12.weight + | -0.001 | -0.091 | 0.088 | 0.055 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.mlp.fc2.weight + | 0.003 | -0.063 | 0.064 | 0.037 | torch.Size([120]) || stage1.residual_group1.blocks.4.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.bias + | -0.000 | -0.066 | 0.077 | 0.020 | torch.Size([675, 6]) || stage1.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_self.weight + | 0.002 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.attn.proj.weight + | 0.005 | -0.065 | 0.064 | 0.041 | torch.Size([120]) || stage1.residual_group1.blocks.5.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc11.weight + | -0.003 | -0.091 | 0.090 | 0.055 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc12.weight + | -0.001 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.mlp.fc2.weight 
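In this parameter table, each row lists mean | min | max | std | shape, then the state_dict key after the double bar; the printed header omits the name column. Since integer buffers such as relative_position_index appear alongside weights, the walk must be over state_dict() rather than named_parameters(). A minimal sketch of such a printer follows; the helper KAIR actually uses may format things differently.

import torch.nn as nn

def param_table(model: nn.Module) -> str:
    # One row per state_dict entry: mean | min | max | std | shape || name.
    rows = [" | mean | min | max | std | shape || name"]
    for name, t in model.state_dict().items():
        v = t.detach().float()  # integer buffers need a cast before mean()/std()
        rows.append(" | {:.3f} | {:.3f} | {:.3f} | {:.3f} | {} || {}".format(
            v.mean(), v.min(), v.max(), v.std(), t.shape, name))
    return "\n".join(rows)

print(param_table(nn.Sequential(nn.Linear(120, 360), nn.LayerNorm(360))))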
+ | -0.003 | -0.064 | 0.063 | 0.038 | torch.Size([120]) || stage1.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([120, 120]) || stage1.linear1.weight + | -0.001 | -0.090 | 0.091 | 0.057 | torch.Size([120]) || stage1.linear1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.bias + | -0.000 | -0.074 | 0.073 | 0.020 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group2.blocks.0.attn.qkv_self.weight + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage1.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage1.residual_group2.blocks.0.attn.proj.weight + | 0.001 | -0.090 | 0.089 | 0.051 | torch.Size([120]) || stage1.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc11.weight + | 0.009 | -0.090 | 0.090 | 0.051 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc12.weight + | -0.004 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group2.blocks.0.mlp.fc2.weight + | 0.001 | -0.064 | 0.063 | 0.035 | torch.Size([120]) || stage1.residual_group2.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.093 | 0.079 | 0.020 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage1.residual_group2.blocks.1.attn.qkv_self.weight + | 0.003 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage1.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage1.residual_group2.blocks.1.attn.proj.weight + | -0.003 | -0.090 | 0.091 | 0.056 | torch.Size([120]) || stage1.residual_group2.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc11.weight + | 0.002 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc12.weight + | -0.004 | -0.091 | 0.089 | 0.054 | torch.Size([240]) || 
stage1.residual_group2.blocks.1.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage1.residual_group2.blocks.1.mlp.fc2.weight + | 0.007 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage1.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage1.linear2.weight + | 0.005 | -0.091 | 0.086 | 0.052 | torch.Size([120]) || stage1.linear2.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage1.pa_deform.bias + | -0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage1.pa_deform.conv_offset.0.weight + | 0.001 | -0.021 | 0.021 | 0.012 | torch.Size([120]) || stage1.pa_deform.conv_offset.0.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.2.weight + | -0.000 | -0.030 | 0.029 | 0.019 | torch.Size([120]) || stage1.pa_deform.conv_offset.2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.4.weight + | 0.000 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage1.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage1.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage1.pa_deform.conv_offset.6.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage1.pa_fuse.fc11.weight + | -0.001 | -0.053 | 0.053 | 0.031 | torch.Size([360]) || stage1.pa_fuse.fc11.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage1.pa_fuse.fc12.weight + | 0.001 | -0.051 | 0.053 | 0.030 | torch.Size([360]) || stage1.pa_fuse.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage1.pa_fuse.fc2.weight + | 0.000 | -0.052 | 0.053 | 0.032 | torch.Size([120]) || stage1.pa_fuse.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([480]) || stage2.reshape.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([480]) || stage2.reshape.1.bias + | 0.000 | -0.046 | 0.046 | 0.026 | torch.Size([120, 480]) || stage2.reshape.2.weight + | -0.001 | -0.044 | 0.043 | 0.026 | torch.Size([120]) || stage2.reshape.2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.067 | 0.061 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_self.weight + | 0.001 | -0.090 | 0.091 | 0.051 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.attn.proj.weight + | 0.001 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage2.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.006 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || 
stage2.residual_group1.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc11.weight + | -0.009 | -0.091 | 0.090 | 0.055 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc12.weight + | -0.003 | -0.090 | 0.091 | 0.052 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.mlp.fc2.weight + | -0.001 | -0.063 | 0.062 | 0.037 | torch.Size([120]) || stage2.residual_group1.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.bias + | -0.001 | -0.070 | 0.072 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_self.weight + | 0.001 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.attn.proj.weight + | 0.002 | -0.064 | 0.064 | 0.036 | torch.Size([120]) || stage2.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.003 | -0.091 | 0.090 | 0.050 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc11.weight + | 0.000 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc12.weight + | 0.013 | -0.090 | 0.090 | 0.052 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.mlp.fc2.weight + | 0.001 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage2.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.bias + | -0.000 | -0.076 | 0.073 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || 
stage2.residual_group1.blocks.2.attn.qkv_self.weight
+ | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.attn.proj.weight
+ | 0.001 | -0.063 | 0.064 | 0.039 | torch.Size([120]) || stage2.residual_group1.blocks.2.attn.proj.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_mut.weight
+ | -0.002 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.006 | -0.090 | 0.090 | 0.051 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc12.weight
+ | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.mlp.fc2.weight
+ | -0.002 | -0.064 | 0.064 | 0.037 | torch.Size([120]) || stage2.residual_group1.blocks.2.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.bias
+ | -0.000 | -0.084 | 0.068 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_self.weight
+ | -0.002 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.attn.proj.weight
+ | -0.002 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage2.residual_group1.blocks.3.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_mut.weight
+ | -0.001 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc11.weight
+ | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc12.weight
+ | 0.005 | -0.086 | 0.090 | 0.052 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.003 | -0.063 | 0.064 | 0.037 | torch.Size([120]) || stage2.residual_group1.blocks.3.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.bias
+ | 0.000 | -0.070 | 0.072 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_self.weight
+ | 0.003 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.attn.proj.weight
+ | 0.006 | -0.058 | 0.064 | 0.036 | torch.Size([120]) || stage2.residual_group1.blocks.4.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_mut.weight
+ | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc11.weight
+ | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc12.weight
+ | -0.002 | -0.089 | 0.091 | 0.051 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.mlp.fc2.weight
+ | 0.006 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage2.residual_group1.blocks.4.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.bias
+ | 0.000 | -0.070 | 0.080 | 0.020 | torch.Size([675, 6]) || stage2.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_self.weight
+ | -0.000 | -0.091 | 0.090 | 0.050 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.attn.proj.weight
+ | -0.000 | -0.064 | 0.064 | 0.037 | torch.Size([120]) || stage2.residual_group1.blocks.5.attn.proj.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_mut.weight
+ | 0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc11.weight
+ | 0.004 | -0.091 | 0.090 | 0.051 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc11.bias
+ | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc12.weight
+ | -0.005 | -0.090 | 0.091 | 0.053 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.mlp.fc2.weight
+ | 0.002 | -0.064 | 0.064 | 0.036 | torch.Size([120]) || stage2.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage2.linear1.weight
+ | 0.005 | -0.091 | 0.091 | 0.055 | torch.Size([120]) || stage2.linear1.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.bias
+ | -0.000 | -0.079 | 0.073 | 0.020 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group2.blocks.0.attn.qkv_self.weight
+ | -0.002 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage2.residual_group2.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage2.residual_group2.blocks.0.attn.proj.weight
+ | -0.002 | -0.091 | 0.088 | 0.052 | torch.Size([120]) || stage2.residual_group2.blocks.0.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc11.weight
+ | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group2.blocks.0.mlp.fc2.weight
+ | 0.002 | -0.064 | 0.063 | 0.035 | torch.Size([120]) || stage2.residual_group2.blocks.0.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.bias
+ | -0.000 | -0.076 | 0.082 | 0.020 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.1.attn.relative_position_index
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage2.residual_group2.blocks.1.attn.qkv_self.weight
+ | -0.002 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage2.residual_group2.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage2.residual_group2.blocks.1.attn.proj.weight
+ | -0.001 | -0.091 | 0.091 | 0.052 | torch.Size([120]) || stage2.residual_group2.blocks.1.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc11.weight
+ | 0.002 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc11.bias
+ | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc12.weight
+ | -0.007 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc12.bias
+ | 0.001 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage2.residual_group2.blocks.1.mlp.fc2.weight
+ | 0.002 | -0.065 | 0.064 | 0.037 | torch.Size([120]) || stage2.residual_group2.blocks.1.mlp.fc2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage2.linear2.weight
+ | 0.000 | -0.088 | 0.091 | 0.053 | torch.Size([120]) || stage2.linear2.bias
+ | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage2.pa_deform.bias
+ | -0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage2.pa_deform.conv_offset.0.weight
+ | -0.001 | -0.021 | 0.021 | 0.013 | torch.Size([120]) || stage2.pa_deform.conv_offset.0.bias
+ | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.2.weight
+ | -0.002 | -0.030 | 0.029 | 0.017 | torch.Size([120]) || stage2.pa_deform.conv_offset.2.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.4.weight
+ | -0.001 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage2.pa_deform.conv_offset.4.bias
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage2.pa_deform.conv_offset.6.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage2.pa_deform.conv_offset.6.bias
+ | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage2.pa_fuse.fc11.weight
+ | -0.002 | -0.053 | 0.052 | 0.030 | torch.Size([360]) || stage2.pa_fuse.fc11.bias
+ | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage2.pa_fuse.fc12.weight
+ | -0.001 | -0.052 | 0.053 | 0.031 | torch.Size([360]) || stage2.pa_fuse.fc12.bias
+ | -0.000 | -0.053 | 0.053 | 0.031 | torch.Size([120, 360]) || stage2.pa_fuse.fc2.weight
+ | 0.001 | -0.045 | 0.051 | 0.029 | torch.Size([120]) || stage2.pa_fuse.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([480]) || stage3.reshape.1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([480]) || stage3.reshape.1.bias
+ | -0.000 | -0.046 | 0.046 | 0.026 | torch.Size([120, 480]) || stage3.reshape.2.weight
+ | 0.001 | -0.045 | 0.045 | 0.028 | torch.Size([120]) || stage3.reshape.2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.bias
+ | 0.000 | -0.075 | 0.073 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.0.attn.position_bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.003 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.attn.proj.weight
+ | 0.003 | -0.061 | 0.063 | 0.038 | torch.Size([120]) || stage3.residual_group1.blocks.0.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_mut.weight
+ | 0.001 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc11.weight
+ | -0.003 | -0.091 | 0.089 | 0.053 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc12.weight
+ | -0.002 | -0.091 | 0.090 | 0.055 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.063 | 0.064 | 0.039 | torch.Size([120]) || stage3.residual_group1.blocks.0.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.bias
+ | -0.000 | -0.076 | 0.078 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.1.attn.position_bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.004 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.attn.proj.weight
+ | 0.002 | -0.061 | 0.060 | 0.036 | torch.Size([120]) || stage3.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_mut.weight
+ | 0.001 | -0.091 | 0.090 | 0.054 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc11.weight
+ | 0.001 | -0.090 | 0.091 | 0.052 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc12.weight
+ | 0.005 | -0.090 | 0.091 | 0.054 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.006 | -0.064 | 0.063 | 0.038 | torch.Size([120]) || stage3.residual_group1.blocks.1.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.bias
+ | -0.000 | -0.072 | 0.067 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.2.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_self.weight
+ | 0.003 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.attn.proj.weight
+ | 0.003 | -0.064 | 0.064 | 0.040 | torch.Size([120]) || stage3.residual_group1.blocks.2.attn.proj.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_mut.weight
+ | 0.002 | -0.090 | 0.091 | 0.051 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc11.weight
+ | 0.004 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc12.weight
+ | 0.001 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.mlp.fc2.weight
+ | -0.006 | -0.063 | 0.063 | 0.037 | torch.Size([120]) || stage3.residual_group1.blocks.2.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.bias
+ | 0.000 | -0.071 | 0.069 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_self.weight
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.attn.proj.weight
+ | 0.006 | -0.064 | 0.064 | 0.035 | torch.Size([120]) || stage3.residual_group1.blocks.3.attn.proj.bias
+ | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_mut.weight
+ | -0.003 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.006 | -0.090 | 0.090 | 0.052 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.001 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.004 | -0.064 | 0.061 | 0.036 | torch.Size([120]) || stage3.residual_group1.blocks.3.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.bias
+ | -0.000 | -0.073 | 0.069 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.002 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.attn.proj.weight
+ | -0.001 | -0.064 | 0.063 | 0.037 | torch.Size([120]) || stage3.residual_group1.blocks.4.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_mut.weight
+ | -0.000 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc11.weight
+ | 0.006 | -0.091 | 0.090 | 0.055 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc11.bias
+ | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.mlp.fc2.weight
+ | 0.001 | -0.064 | 0.064 | 0.036 | torch.Size([120]) || stage3.residual_group1.blocks.4.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.bias
+ | 0.000 | -0.072 | 0.077 | 0.020 | torch.Size([675, 6]) || stage3.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_self.weight
+ | 0.001 | -0.089 | 0.090 | 0.049 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.attn.proj.weight
+ | -0.006 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage3.residual_group1.blocks.5.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_mut.weight
+ | -0.005 | -0.090 | 0.091 | 0.054 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc11.weight
+ | 0.000 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.000 | -0.090 | 0.091 | 0.052 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.mlp.fc2.weight
+ | -0.002 | -0.064 | 0.063 | 0.036 | torch.Size([120]) || stage3.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage3.linear1.weight
+ | -0.002 | -0.091 | 0.091 | 0.052 | torch.Size([120]) || stage3.linear1.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.bias
+ | -0.000 | -0.095 | 0.080 | 0.020 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.002 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage3.residual_group2.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([120, 120]) || stage3.residual_group2.blocks.0.attn.proj.weight
+ | -0.001 | -0.090 | 0.091 | 0.049 | torch.Size([120]) || stage3.residual_group2.blocks.0.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc11.weight
+ | 0.001 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group2.blocks.0.mlp.fc2.weight
+ | -0.003 | -0.064 | 0.063 | 0.039 | torch.Size([120]) || stage3.residual_group2.blocks.0.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.bias
+ | -0.000 | -0.081 | 0.070 | 0.020 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.1.attn.relative_position_index
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage3.residual_group2.blocks.1.attn.qkv_self.weight
+ | -0.002 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage3.residual_group2.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([120, 120]) || stage3.residual_group2.blocks.1.attn.proj.weight
+ | -0.000 | -0.091 | 0.091 | 0.054 | torch.Size([120]) || stage3.residual_group2.blocks.1.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc11.weight
+ | 0.004 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc12.weight
+ | -0.005 | -0.090 | 0.091 | 0.054 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.001 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage3.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.005 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage3.residual_group2.blocks.1.mlp.fc2.bias
+ | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage3.linear2.weight
+ | 0.001 | -0.089 | 0.091 | 0.051 | torch.Size([120]) || stage3.linear2.bias
+ | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage3.pa_deform.bias
+ | 0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage3.pa_deform.conv_offset.0.weight
+ | -0.002 | -0.021 | 0.021 | 0.013 | torch.Size([120]) || stage3.pa_deform.conv_offset.0.bias
+ | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.2.weight
+ | 0.002 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage3.pa_deform.conv_offset.2.bias
+ | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.4.weight
+ | 0.000 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage3.pa_deform.conv_offset.4.bias
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage3.pa_deform.conv_offset.6.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage3.pa_deform.conv_offset.6.bias
+ | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage3.pa_fuse.fc11.weight
+ | -0.001 | -0.052 | 0.052 | 0.030 | torch.Size([360]) || stage3.pa_fuse.fc11.bias
+ | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage3.pa_fuse.fc12.weight
+ | 0.001 | -0.052 | 0.053 | 0.030 | torch.Size([360]) || stage3.pa_fuse.fc12.bias
+ | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage3.pa_fuse.fc2.weight
+ | 0.007 | -0.051 | 0.052 | 0.030 | torch.Size([120]) || stage3.pa_fuse.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([480]) || stage4.reshape.1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([480]) || stage4.reshape.1.bias
+ | -0.000 | -0.046 | 0.046 | 0.026 | torch.Size([120, 480]) || stage4.reshape.2.weight
+ | 0.003 | -0.045 | 0.045 | 0.028 | torch.Size([120]) || stage4.reshape.2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.bias
+ | -0.000 | -0.068 | 0.084 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.0.attn.position_bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_self.weight
+ | 0.006 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.attn.proj.weight
+ | 0.003 | -0.064 | 0.064 | 0.037 | torch.Size([120]) || stage4.residual_group1.blocks.0.attn.proj.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_mut.weight
+ | -0.001 | -0.090 | 0.091 | 0.051 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.bias
+ | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc11.weight
+ | 0.004 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc12.weight
+ | 0.001 | -0.090 | 0.089 | 0.052 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.mlp.fc2.weight
+ | -0.002 | -0.064 | 0.063 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.0.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.bias
+ | 0.000 | -0.076 | 0.082 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.1.attn.position_bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_self.weight
+ | -0.000 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.attn.proj.weight
+ | -0.001 | -0.064 | 0.063 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.1.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_mut.weight
+ | -0.002 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.005 | -0.091 | 0.090 | 0.052 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc12.weight
+ | 0.006 | -0.090 | 0.090 | 0.053 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.001 | -0.062 | 0.064 | 0.036 | torch.Size([120]) || stage4.residual_group1.blocks.1.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.bias
+ | -0.000 | -0.071 | 0.082 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.2.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_self.weight
+ | 0.002 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.attn.proj.weight
+ | 0.004 | -0.063 | 0.064 | 0.041 | torch.Size([120]) || stage4.residual_group1.blocks.2.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_mut.weight
+ | 0.003 | -0.091 | 0.089 | 0.053 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc11.weight
+ | 0.006 | -0.091 | 0.090 | 0.050 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc12.weight
+ | -0.000 | -0.088 | 0.091 | 0.052 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.mlp.fc2.weight
+ | -0.002 | -0.064 | 0.063 | 0.040 | torch.Size([120]) || stage4.residual_group1.blocks.2.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.bias
+ | 0.000 | -0.083 | 0.065 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.3.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_self.weight
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.attn.proj.weight
+ | 0.000 | -0.063 | 0.064 | 0.039 | torch.Size([120]) || stage4.residual_group1.blocks.3.attn.proj.bias
+ | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_mut.weight
+ | 0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.001 | -0.091 | 0.090 | 0.053 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.005 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.mlp.fc2.weight
+ | -0.002 | -0.064 | 0.062 | 0.034 | torch.Size([120]) || stage4.residual_group1.blocks.3.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.bias
+ | -0.000 | -0.078 | 0.072 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_self.weight
+ | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.attn.proj.weight
+ | -0.001 | -0.063 | 0.064 | 0.037 | torch.Size([120]) || stage4.residual_group1.blocks.4.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_mut.weight
+ | 0.004 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc11.weight
+ | 0.005 | -0.091 | 0.090 | 0.055 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc12.weight
+ | -0.004 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc12.bias
+ | -0.001 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.005 | -0.064 | 0.063 | 0.037 | torch.Size([120]) || stage4.residual_group1.blocks.4.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.bias
+ | 0.000 | -0.079 | 0.076 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.5.attn.position_bias
+ | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_self.weight
+ | 0.001 | -0.091 | 0.091 | 0.050 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.attn.proj.weight
+ | -0.002 | -0.063 | 0.064 | 0.037 | torch.Size([120]) || stage4.residual_group1.blocks.5.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_mut.weight
+ | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc11.weight
+ | 0.005 | -0.090 | 0.089 | 0.053 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.002 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.mlp.fc2.weight
+ | -0.003 | -0.063 | 0.063 | 0.038 | torch.Size([120]) || stage4.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage4.linear1.weight
+ | 0.004 | -0.089 | 0.090 | 0.054 | torch.Size([120]) || stage4.linear1.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.bias
+ | -0.000 | -0.081 | 0.077 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group2.blocks.0.attn.qkv_self.weight
+ | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage4.residual_group2.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage4.residual_group2.blocks.0.attn.proj.weight
+ | -0.005 | -0.090 | 0.091 | 0.051 | torch.Size([120]) || stage4.residual_group2.blocks.0.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.003 | -0.088 | 0.091 | 0.052 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group2.blocks.0.mlp.fc2.weight
+ | -0.004 | -0.064 | 0.065 | 0.039 | torch.Size([120]) || stage4.residual_group2.blocks.0.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.bias
+ | 0.000 | -0.074 | 0.079 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.1.attn.relative_position_index
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage4.residual_group2.blocks.1.attn.qkv_self.weight
+ | -0.004 | -0.091 | 0.090 | 0.050 | torch.Size([360]) || stage4.residual_group2.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage4.residual_group2.blocks.1.attn.proj.weight
+ | 0.005 | -0.090 | 0.088 | 0.053 | torch.Size([120]) || stage4.residual_group2.blocks.1.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc11.weight
+ | 0.001 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc12.weight
+ | 0.003 | -0.091 | 0.090 | 0.053 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage4.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.005 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage4.residual_group2.blocks.1.mlp.fc2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage4.linear2.weight
+ | -0.001 | -0.091 | 0.087 | 0.054 | torch.Size([120]) || stage4.linear2.bias
+ | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.pa_deform.bias
+ | -0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage4.pa_deform.conv_offset.0.weight
+ | 0.001 | -0.021 | 0.021 | 0.013 | torch.Size([120]) || stage4.pa_deform.conv_offset.0.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.2.weight
+ | 0.001 | -0.030 | 0.029 | 0.017 | torch.Size([120]) || stage4.pa_deform.conv_offset.2.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.4.weight
+ | 0.001 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage4.pa_deform.conv_offset.4.bias
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage4.pa_deform.conv_offset.6.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage4.pa_deform.conv_offset.6.bias
+ | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage4.pa_fuse.fc11.weight
+ | -0.001 | -0.053 | 0.052 | 0.031 | torch.Size([360]) || stage4.pa_fuse.fc11.bias
+ | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage4.pa_fuse.fc12.weight
+ | 0.001 | -0.053 | 0.052 | 0.031 | torch.Size([360]) || stage4.pa_fuse.fc12.bias
+ | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage4.pa_fuse.fc2.weight
+ | 0.003 | -0.053 | 0.052 | 0.029 | torch.Size([120]) || stage4.pa_fuse.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([30]) || stage5.reshape.1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([30]) || stage5.reshape.1.bias
+ | 0.001 | -0.182 | 0.182 | 0.106 | torch.Size([120, 30]) || stage5.reshape.2.weight
+ | 0.009 | -0.178 | 0.182 | 0.107 | torch.Size([120]) || stage5.reshape.2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.bias
+ | 0.000 | -0.067 | 0.075 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.0.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_self.weight
+ | 0.001 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.attn.proj.weight
+ | 0.002 | -0.063 | 0.064 | 0.039 | torch.Size([120]) || stage5.residual_group1.blocks.0.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_mut.weight
+ | -0.005 | -0.090 | 0.091 | 0.052 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc11.weight
+ | 0.004 | -0.090 | 0.090 | 0.052 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc12.weight
+ | -0.004 | -0.091 | 0.090 | 0.055 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.005 | -0.064 | 0.062 | 0.038 | torch.Size([120]) || stage5.residual_group1.blocks.0.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.bias
+ | 0.000 | -0.073 | 0.071 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.1.attn.position_bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_self.weight
+ | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.attn.proj.weight
+ | -0.002 | -0.064 | 0.061 | 0.035 | torch.Size([120]) || stage5.residual_group1.blocks.1.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_mut.weight
+ | 0.002 | -0.091 | 0.090 | 0.050 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc11.weight
+ | 0.002 | -0.091 | 0.090 | 0.054 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc12.weight
+ | 0.006 | -0.091 | 0.090 | 0.054 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.007 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage5.residual_group1.blocks.1.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.bias
+ | -0.000 | -0.074 | 0.089 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.2.attn.position_bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_self.weight
+ | 0.003 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.attn.proj.weight
+ | 0.001 | -0.062 | 0.064 | 0.038 | torch.Size([120]) || stage5.residual_group1.blocks.2.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_mut.weight
+ | 0.001 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.bias
+ | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.002 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc12.weight
+ | 0.000 | -0.090 | 0.089 | 0.052 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.mlp.fc2.weight
+ | 0.002 | -0.063 | 0.064 | 0.037 | torch.Size([120]) || stage5.residual_group1.blocks.2.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.bias
+ | -0.000 | -0.065 | 0.082 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.3.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_self.weight
+ | -0.003 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.attn.proj.weight
+ | 0.004 | -0.062 | 0.062 | 0.035 | torch.Size([120]) || stage5.residual_group1.blocks.3.attn.proj.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_mut.weight
+ | 0.000 | -0.091 | 0.087 | 0.052 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc11.weight
+ | 0.001 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.mlp.fc2.weight
+ | -0.002 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage5.residual_group1.blocks.3.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.bias
+ | 0.000 | -0.072 | 0.079 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_self.weight
+ | 0.003 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.attn.proj.weight
+ | -0.003 | -0.063 | 0.062 | 0.035 | torch.Size([120]) || stage5.residual_group1.blocks.4.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_mut.weight
+ | 0.002 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc11.weight
+ | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.005 | -0.091 | 0.091 | 0.055 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.001 | -0.063 | 0.064 | 0.036 | torch.Size([120]) || stage5.residual_group1.blocks.4.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.bias
+ | 0.000 | -0.068 | 0.070 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_self.weight
+ | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_self.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.attn.proj.weight
+ | -0.007 | -0.064 | 0.064 | 0.037 | torch.Size([120]) || stage5.residual_group1.blocks.5.attn.proj.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_mut.weight
+ | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc11.weight
+ | 0.002 | -0.091 | 0.090 | 0.051 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc11.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.004 | -0.091 | 0.091 | 0.051 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.mlp.fc2.weight
+ | -0.001 | -0.064 | 0.064 | 0.040 | torch.Size([120]) || stage5.residual_group1.blocks.5.mlp.fc2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage5.linear1.weight
+ | -0.002 | -0.090 | 0.091 | 0.057 | torch.Size([120]) || stage5.linear1.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.bias
+ | -0.000 | -0.078 | 0.101 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.005 | -0.090 | 0.091 | 0.053 | torch.Size([360]) || stage5.residual_group2.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage5.residual_group2.blocks.0.attn.proj.weight
+ | 0.006 | -0.090 | 0.091 | 0.054 | torch.Size([120]) || stage5.residual_group2.blocks.0.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.004 | -0.091 | 0.090 | 0.054 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.003 | -0.091 | 0.090 | 0.050 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group2.blocks.0.mlp.fc2.weight
+ | -0.001 | -0.064 | 0.063 | 0.039 | torch.Size([120]) || stage5.residual_group2.blocks.0.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.bias
+ | -0.000 | -0.087 | 0.084 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.1.attn.relative_position_index
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage5.residual_group2.blocks.1.attn.qkv_self.weight
+ | 0.002 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage5.residual_group2.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage5.residual_group2.blocks.1.attn.proj.weight
+ | 0.000 | -0.089 | 0.091 | 0.053 | torch.Size([120]) || stage5.residual_group2.blocks.1.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.bias
+ | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.002 | -0.091 | 0.091 | 0.050 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc12.weight
+ | -0.003 | -0.090 | 0.091 | 0.052 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage5.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.001 | -0.062 | 0.064 | 0.039 | torch.Size([120]) || stage5.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage5.linear2.weight
+ | -0.013 | -0.088 | 0.083 | 0.050 | torch.Size([120]) || stage5.linear2.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage5.pa_deform.bias
+ | 0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage5.pa_deform.conv_offset.0.weight
+ | 0.001 | -0.021 | 0.021 | 0.013 | torch.Size([120]) || stage5.pa_deform.conv_offset.0.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.2.weight
+ | -0.001 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage5.pa_deform.conv_offset.2.bias
+ | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.4.weight
+ | 0.000 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage5.pa_deform.conv_offset.4.bias
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage5.pa_deform.conv_offset.6.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage5.pa_deform.conv_offset.6.bias
+ | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage5.pa_fuse.fc11.weight
+ | 0.000 | -0.053 | 0.053 | 0.031 | torch.Size([360]) || stage5.pa_fuse.fc11.bias
+ | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage5.pa_fuse.fc12.weight
+ | 0.001 | -0.053 | 0.053 | 0.030 | torch.Size([360]) || stage5.pa_fuse.fc12.bias
+ | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage5.pa_fuse.fc2.weight
+ | -0.006 | -0.050 | 0.051 | 0.028 | torch.Size([120]) || stage5.pa_fuse.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([30]) || stage6.reshape.1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([30]) || stage6.reshape.1.bias
+ | -0.002 | -0.182 | 0.183 | 0.106 | torch.Size([120, 30]) || stage6.reshape.2.weight
+ | -0.008 | -0.181 | 0.180 | 0.110 | torch.Size([120]) || stage6.reshape.2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.bias
+ | -0.000 | -0.069 | 0.069 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.0.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_self.weight
+ | 0.002 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.attn.proj.weight
+ | -0.005 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage6.residual_group1.blocks.0.attn.proj.bias
+ | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_mut.weight
+ | 0.002 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.bias
+ | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc11.weight
+ | -0.007 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc11.bias
+ | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc12.weight
+ | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.mlp.fc2.weight
+ | -0.001 | -0.064 | 0.064 | 0.038 | torch.Size([120]) || stage6.residual_group1.blocks.0.mlp.fc2.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.bias
+ | -0.000 | -0.068 | 0.074 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.1.attn.position_bias
+ | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.004 | -0.090 | 0.091 | 0.052 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_self.bias
+
| 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.attn.proj.weight + | 0.000 | -0.065 | 0.062 | 0.036 | torch.Size([120]) || stage6.residual_group1.blocks.1.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.001 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc11.weight + | 0.001 | -0.091 | 0.090 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc12.weight + | -0.002 | -0.090 | 0.090 | 0.051 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.mlp.fc2.weight + | 0.002 | -0.064 | 0.063 | 0.039 | torch.Size([120]) || stage6.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.bias + | 0.000 | -0.080 | 0.079 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.2.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_self.weight + | 0.003 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_self.bias + | -0.001 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.attn.proj.weight + | 0.010 | -0.065 | 0.064 | 0.036 | torch.Size([120]) || stage6.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.001 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc11.weight + | 0.004 | -0.090 | 0.091 | 0.052 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc12.weight + | 0.000 | -0.091 | 0.090 | 0.052 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.mlp.fc2.weight + | 0.004 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage6.residual_group1.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || 
stage6.residual_group1.blocks.3.norm1.bias + | 0.000 | -0.069 | 0.074 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_self.weight + | -0.005 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.attn.proj.weight + | -0.002 | -0.064 | 0.064 | 0.036 | torch.Size([120]) || stage6.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc11.weight + | -0.001 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc12.weight + | -0.004 | -0.088 | 0.087 | 0.047 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.mlp.fc2.weight + | -0.000 | -0.062 | 0.064 | 0.037 | torch.Size([120]) || stage6.residual_group1.blocks.3.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.bias + | 0.000 | -0.065 | 0.074 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_self.weight + | -0.003 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.attn.proj.weight + | 0.007 | -0.064 | 0.063 | 0.037 | torch.Size([120]) || stage6.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.001 | -0.091 | 0.091 | 0.051 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc11.weight + | -0.006 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || 
stage6.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc12.weight + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.mlp.fc2.weight + | 0.000 | -0.062 | 0.064 | 0.037 | torch.Size([120]) || stage6.residual_group1.blocks.4.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.bias + | -0.000 | -0.069 | 0.075 | 0.020 | torch.Size([675, 6]) || stage6.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_self.weight + | 0.004 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.attn.proj.weight + | -0.001 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage6.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.003 | -0.090 | 0.090 | 0.055 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc11.weight + | 0.002 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc12.weight + | -0.003 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.mlp.fc2.weight + | -0.001 | -0.064 | 0.065 | 0.038 | torch.Size([120]) || stage6.residual_group1.blocks.5.mlp.fc2.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage6.linear1.weight + | -0.005 | -0.089 | 0.091 | 0.055 | torch.Size([120]) || stage6.linear1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.bias + | 0.000 | -0.077 | 0.081 | 0.020 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group2.blocks.0.attn.qkv_self.weight + | 0.005 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage6.residual_group2.blocks.0.attn.qkv_self.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage6.residual_group2.blocks.0.attn.proj.weight + | 0.003 | -0.090 | 
0.090 | 0.046 | torch.Size([120]) || stage6.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc11.weight + | -0.000 | -0.090 | 0.089 | 0.054 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc12.weight + | 0.003 | -0.091 | 0.089 | 0.052 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group2.blocks.0.mlp.fc2.weight + | -0.000 | -0.064 | 0.064 | 0.035 | torch.Size([120]) || stage6.residual_group2.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.079 | 0.080 | 0.020 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage6.residual_group2.blocks.1.attn.qkv_self.weight + | -0.004 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage6.residual_group2.blocks.1.attn.qkv_self.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage6.residual_group2.blocks.1.attn.proj.weight + | 0.000 | -0.091 | 0.091 | 0.055 | torch.Size([120]) || stage6.residual_group2.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc11.weight + | -0.001 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc11.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc12.weight + | 0.000 | -0.090 | 0.090 | 0.057 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage6.residual_group2.blocks.1.mlp.fc2.weight + | -0.000 | -0.064 | 0.064 | 0.035 | torch.Size([120]) || stage6.residual_group2.blocks.1.mlp.fc2.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage6.linear2.weight + | 0.002 | -0.091 | 0.091 | 0.055 | torch.Size([120]) || stage6.linear2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage6.pa_deform.bias + | 0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || stage6.pa_deform.conv_offset.0.weight + | -0.001 | -0.021 | 0.021 | 0.013 | torch.Size([120]) || stage6.pa_deform.conv_offset.0.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.2.weight + | -0.001 | -0.030 | 0.030 | 0.019 | torch.Size([120]) || stage6.pa_deform.conv_offset.2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.4.weight + | -0.001 | -0.029 
| 0.029 | 0.017 | torch.Size([120]) || stage6.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage6.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage6.pa_deform.conv_offset.6.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage6.pa_fuse.fc11.weight + | -0.001 | -0.053 | 0.053 | 0.030 | torch.Size([360]) || stage6.pa_fuse.fc11.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage6.pa_fuse.fc12.weight + | 0.000 | -0.052 | 0.053 | 0.031 | torch.Size([360]) || stage6.pa_fuse.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage6.pa_fuse.fc2.weight + | 0.000 | -0.051 | 0.052 | 0.031 | torch.Size([120]) || stage6.pa_fuse.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([30]) || stage7.reshape.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([30]) || stage7.reshape.1.bias + | 0.001 | -0.183 | 0.182 | 0.106 | torch.Size([120, 30]) || stage7.reshape.2.weight + | -0.004 | -0.178 | 0.182 | 0.104 | torch.Size([120]) || stage7.reshape.2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.061 | 0.074 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_self.weight + | 0.003 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.attn.proj.weight + | -0.002 | -0.064 | 0.064 | 0.034 | torch.Size([120]) || stage7.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc11.weight + | -0.001 | -0.090 | 0.091 | 0.052 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc12.weight + | -0.002 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.mlp.fc2.weight + | -0.002 | -0.064 | 0.064 | 0.039 | torch.Size([120]) || stage7.residual_group1.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.bias + | -0.000 | -0.069 | 0.071 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 
0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_self.weight + | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.attn.proj.weight + | -0.007 | -0.064 | 0.063 | 0.035 | torch.Size([120]) || stage7.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.001 | -0.091 | 0.091 | 0.055 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc11.weight + | -0.003 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc12.weight + | -0.002 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.mlp.fc2.weight + | -0.006 | -0.064 | 0.059 | 0.038 | torch.Size([120]) || stage7.residual_group1.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.bias + | -0.000 | -0.083 | 0.070 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.2.attn.position_bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_self.weight + | -0.001 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.attn.proj.weight + | -0.001 | -0.061 | 0.064 | 0.037 | torch.Size([120]) || stage7.residual_group1.blocks.2.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.006 | -0.091 | 0.091 | 0.052 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc11.weight + | -0.001 | -0.090 | 0.091 | 0.055 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc12.weight + | -0.000 | -0.090 | 0.090 | 
0.052 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.mlp.fc2.weight + | -0.000 | -0.064 | 0.063 | 0.037 | torch.Size([120]) || stage7.residual_group1.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.bias + | -0.000 | -0.066 | 0.069 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_self.weight + | -0.001 | -0.091 | 0.090 | 0.053 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.attn.proj.weight + | -0.000 | -0.064 | 0.064 | 0.037 | torch.Size([120]) || stage7.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.004 | -0.091 | 0.090 | 0.051 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc11.weight + | -0.002 | -0.090 | 0.091 | 0.053 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc12.weight + | -0.003 | -0.091 | 0.090 | 0.054 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.mlp.fc2.weight + | -0.001 | -0.064 | 0.062 | 0.039 | torch.Size([120]) || stage7.residual_group1.blocks.3.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.081 | 0.067 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.4.attn.position_bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_self.weight + | -0.002 | -0.091 | 0.089 | 0.052 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.attn.proj.weight + | -0.001 | -0.063 | 0.063 | 0.036 | torch.Size([120]) || stage7.residual_group1.blocks.4.attn.proj.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.001 | -0.090 | 0.089 | 0.054 | 
torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc11.weight + | 0.000 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc12.weight + | 0.005 | -0.090 | 0.091 | 0.051 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.mlp.fc2.weight + | -0.000 | -0.063 | 0.063 | 0.037 | torch.Size([120]) || stage7.residual_group1.blocks.4.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.bias + | 0.000 | -0.070 | 0.076 | 0.020 | torch.Size([675, 6]) || stage7.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_self.weight + | 0.004 | -0.091 | 0.090 | 0.053 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.attn.proj.weight + | 0.001 | -0.063 | 0.063 | 0.036 | torch.Size([120]) || stage7.residual_group1.blocks.5.attn.proj.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.008 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc11.weight + | 0.003 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc12.weight + | -0.003 | -0.091 | 0.091 | 0.054 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.mlp.fc2.weight + | -0.004 | -0.062 | 0.064 | 0.036 | torch.Size([120]) || stage7.residual_group1.blocks.5.mlp.fc2.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage7.linear1.weight + | -0.007 | -0.091 | 0.090 | 0.051 | torch.Size([120]) || stage7.linear1.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.bias + | -0.000 | -0.078 | 0.090 | 0.020 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | 
torch.Size([384, 384]) || stage7.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group2.blocks.0.attn.qkv_self.weight + | 0.000 | -0.091 | 0.090 | 0.054 | torch.Size([360]) || stage7.residual_group2.blocks.0.attn.qkv_self.bias + | -0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage7.residual_group2.blocks.0.attn.proj.weight + | 0.002 | -0.090 | 0.087 | 0.055 | torch.Size([120]) || stage7.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc11.weight + | 0.001 | -0.091 | 0.088 | 0.051 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc12.weight + | 0.001 | -0.091 | 0.091 | 0.052 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group2.blocks.0.mlp.fc2.weight + | 0.003 | -0.063 | 0.064 | 0.038 | torch.Size([120]) || stage7.residual_group2.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.bias + | 0.000 | -0.079 | 0.079 | 0.020 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage7.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([360, 120]) || stage7.residual_group2.blocks.1.attn.qkv_self.weight + | -0.004 | -0.091 | 0.090 | 0.052 | torch.Size([360]) || stage7.residual_group2.blocks.1.attn.qkv_self.bias + | 0.001 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage7.residual_group2.blocks.1.attn.proj.weight + | 0.007 | -0.090 | 0.090 | 0.052 | torch.Size([120]) || stage7.residual_group2.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc11.weight + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.091 | 0.091 | 0.053 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc12.weight + | 0.001 | -0.091 | 0.090 | 0.052 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.065 | 0.065 | 0.037 | torch.Size([120, 240]) || stage7.residual_group2.blocks.1.mlp.fc2.weight + | 0.005 | -0.060 | 0.064 | 0.036 | torch.Size([120]) || stage7.residual_group2.blocks.1.mlp.fc2.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([120, 120]) || stage7.linear2.weight + | -0.009 | -0.087 | 0.087 | 0.048 | torch.Size([120]) || stage7.linear2.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage7.pa_deform.bias + | -0.000 | -0.021 | 0.021 | 0.012 | torch.Size([120, 242, 3, 3]) || 
stage7.pa_deform.conv_offset.0.weight + | 0.002 | -0.020 | 0.021 | 0.012 | torch.Size([120]) || stage7.pa_deform.conv_offset.0.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.2.weight + | 0.000 | -0.030 | 0.030 | 0.016 | torch.Size([120]) || stage7.pa_deform.conv_offset.2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.4.weight + | 0.000 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage7.pa_deform.conv_offset.4.bias + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324, 120, 3, 3]) || stage7.pa_deform.conv_offset.6.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([324]) || stage7.pa_deform.conv_offset.6.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage7.pa_fuse.fc11.weight + | 0.000 | -0.052 | 0.052 | 0.029 | torch.Size([360]) || stage7.pa_fuse.fc11.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([360, 360]) || stage7.pa_fuse.fc12.weight + | 0.002 | -0.053 | 0.053 | 0.031 | torch.Size([360]) || stage7.pa_fuse.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([120, 360]) || stage7.pa_fuse.fc2.weight + | 0.001 | -0.052 | 0.052 | 0.031 | torch.Size([120]) || stage7.pa_fuse.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage8.0.1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([120]) || stage8.0.1.bias + | 0.000 | -0.091 | 0.091 | 0.053 | torch.Size([180, 120]) || stage8.0.2.weight + | -0.001 | -0.090 | 0.090 | 0.053 | torch.Size([180]) || stage8.0.2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.bias + | 0.000 | -0.075 | 0.081 | 0.020 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.0.attn.qkv_self.weight + | -0.000 | -0.075 | 0.074 | 0.043 | torch.Size([540]) || stage8.1.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.0.attn.proj.weight + | 0.001 | -0.074 | 0.074 | 0.042 | torch.Size([180]) || stage8.1.residual_group.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc11.weight + | 0.001 | -0.075 | 0.074 | 0.042 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc12.weight + | 0.002 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.0.mlp.fc2.weight + | -0.000 | -0.052 | 0.053 | 0.032 | torch.Size([180]) || stage8.1.residual_group.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.bias + | 0.000 | -0.073 | 0.074 
| 0.020 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.1.attn.qkv_self.weight + | -0.002 | -0.074 | 0.074 | 0.042 | torch.Size([540]) || stage8.1.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.1.attn.proj.weight + | 0.003 | -0.073 | 0.074 | 0.042 | torch.Size([180]) || stage8.1.residual_group.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc11.weight + | -0.000 | -0.075 | 0.074 | 0.044 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc12.weight + | -0.002 | -0.074 | 0.073 | 0.043 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.031 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.1.mlp.fc2.weight + | 0.001 | -0.052 | 0.052 | 0.029 | torch.Size([180]) || stage8.1.residual_group.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.bias + | 0.000 | -0.072 | 0.078 | 0.020 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.2.attn.qkv_self.weight + | 0.002 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.1.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.2.attn.proj.weight + | -0.002 | -0.074 | 0.074 | 0.043 | torch.Size([180]) || stage8.1.residual_group.blocks.2.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc11.weight + | 0.000 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc12.weight + | -0.001 | -0.074 | 0.073 | 0.044 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.2.mlp.fc2.weight + | 0.002 | -0.049 | 0.053 | 0.030 | torch.Size([180]) || stage8.1.residual_group.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.bias + | -0.000 | -0.071 | 0.085 | 0.020 | 
torch.Size([2475, 6]) || stage8.1.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.3.attn.qkv_self.weight + | -0.002 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.1.residual_group.blocks.3.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.3.attn.proj.weight + | 0.002 | -0.074 | 0.074 | 0.042 | torch.Size([180]) || stage8.1.residual_group.blocks.3.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc11.weight + | 0.002 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc12.weight + | 0.000 | -0.073 | 0.074 | 0.042 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.3.mlp.fc2.weight + | -0.005 | -0.053 | 0.052 | 0.030 | torch.Size([180]) || stage8.1.residual_group.blocks.3.mlp.fc2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.1.linear.weight + | -0.002 | -0.074 | 0.074 | 0.043 | torch.Size([180]) || stage8.1.linear.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.bias + | 0.000 | -0.075 | 0.080 | 0.020 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.0.attn.qkv_self.weight + | -0.002 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.2.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.0.attn.proj.weight + | 0.001 | -0.072 | 0.074 | 0.042 | torch.Size([180]) || stage8.2.residual_group.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc11.weight + | -0.002 | -0.074 | 0.073 | 0.043 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc12.weight + | -0.000 | -0.074 | 0.074 | 0.041 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.0.mlp.fc2.weight + | -0.002 | -0.052 | 0.052 | 0.030 | torch.Size([180]) || stage8.2.residual_group.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || 
stage8.2.residual_group.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.bias + | 0.000 | -0.084 | 0.071 | 0.020 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.1.attn.qkv_self.weight + | 0.001 | -0.074 | 0.074 | 0.040 | torch.Size([540]) || stage8.2.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.1.attn.proj.weight + | -0.002 | -0.074 | 0.070 | 0.042 | torch.Size([180]) || stage8.2.residual_group.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc11.weight + | -0.000 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc12.weight + | -0.001 | -0.075 | 0.073 | 0.041 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.1.mlp.fc2.weight + | -0.001 | -0.053 | 0.052 | 0.030 | torch.Size([180]) || stage8.2.residual_group.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.bias + | -0.000 | -0.086 | 0.076 | 0.020 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.2.attn.qkv_self.weight + | -0.001 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.2.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.2.attn.proj.weight + | 0.002 | -0.073 | 0.074 | 0.041 | torch.Size([180]) || stage8.2.residual_group.blocks.2.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc11.weight + | 0.000 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc12.weight + | -0.002 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.031 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.2.mlp.fc2.weight + | 0.002 | -0.053 | 0.053 | 0.031 | torch.Size([180]) || stage8.2.residual_group.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || 
stage8.2.residual_group.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.bias + | 0.000 | -0.078 | 0.070 | 0.020 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.3.attn.qkv_self.weight + | 0.001 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.2.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.3.attn.proj.weight + | -0.002 | -0.074 | 0.075 | 0.046 | torch.Size([180]) || stage8.2.residual_group.blocks.3.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc11.weight + | 0.002 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc12.weight + | -0.003 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.3.mlp.fc2.weight + | 0.001 | -0.052 | 0.052 | 0.030 | torch.Size([180]) || stage8.2.residual_group.blocks.3.mlp.fc2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.2.linear.weight + | 0.004 | -0.074 | 0.074 | 0.044 | torch.Size([180]) || stage8.2.linear.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.bias + | -0.000 | -0.087 | 0.074 | 0.020 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.074 | 0.075 | 0.043 | torch.Size([540]) || stage8.3.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.0.attn.proj.weight + | 0.004 | -0.072 | 0.074 | 0.041 | torch.Size([180]) || stage8.3.residual_group.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc11.weight + | 0.000 | -0.073 | 0.074 | 0.043 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc12.weight + | 0.000 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.0.mlp.fc2.weight + | -0.000 | 
-0.053 | 0.052 | 0.031 | torch.Size([180]) || stage8.3.residual_group.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.bias + | 0.000 | -0.074 | 0.073 | 0.020 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.1.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.1.attn.qkv_self.weight + | 0.001 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.3.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.1.attn.proj.weight + | 0.002 | -0.074 | 0.074 | 0.043 | torch.Size([180]) || stage8.3.residual_group.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc11.weight + | -0.001 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc12.weight + | 0.002 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.1.mlp.fc2.weight + | -0.001 | -0.053 | 0.051 | 0.030 | torch.Size([180]) || stage8.3.residual_group.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.bias + | -0.000 | -0.085 | 0.087 | 0.020 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.2.attn.qkv_self.weight + | 0.002 | -0.075 | 0.074 | 0.044 | torch.Size([540]) || stage8.3.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.2.attn.proj.weight + | -0.005 | -0.074 | 0.074 | 0.043 | torch.Size([180]) || stage8.3.residual_group.blocks.2.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc11.weight + | 0.004 | -0.074 | 0.075 | 0.045 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc12.weight + | -0.003 | -0.074 | 0.071 | 0.042 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.2.mlp.fc2.weight + | 0.001 | -0.052 | 0.053 
| 0.030 | torch.Size([180]) || stage8.3.residual_group.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.bias + | -0.000 | -0.077 | 0.093 | 0.020 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.3.attn.qkv_self.weight + | 0.002 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.3.residual_group.blocks.3.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.3.attn.proj.weight + | 0.002 | -0.074 | 0.074 | 0.045 | torch.Size([180]) || stage8.3.residual_group.blocks.3.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc11.weight + | -0.001 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc12.weight + | 0.002 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.3.mlp.fc2.weight + | -0.001 | -0.052 | 0.053 | 0.032 | torch.Size([180]) || stage8.3.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.3.linear.weight + | 0.002 | -0.074 | 0.073 | 0.042 | torch.Size([180]) || stage8.3.linear.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.bias + | 0.000 | -0.074 | 0.082 | 0.020 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.4.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.0.attn.proj.weight + | 0.003 | -0.074 | 0.074 | 0.042 | torch.Size([180]) || stage8.4.residual_group.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc11.weight + | 0.002 | -0.074 | 0.075 | 0.045 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc12.weight + | 0.002 | -0.073 | 0.074 | 0.043 | torch.Size([360]) || 
stage8.4.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.0.mlp.fc2.weight + | -0.001 | -0.053 | 0.053 | 0.029 | torch.Size([180]) || stage8.4.residual_group.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.bias + | 0.000 | -0.077 | 0.076 | 0.020 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.1.attn.qkv_self.weight + | -0.003 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.4.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.1.attn.proj.weight + | -0.004 | -0.074 | 0.074 | 0.044 | torch.Size([180]) || stage8.4.residual_group.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc11.weight + | -0.001 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc12.weight + | -0.002 | -0.074 | 0.074 | 0.045 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.1.mlp.fc2.weight + | 0.003 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.4.residual_group.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.bias + | -0.000 | -0.075 | 0.073 | 0.020 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.2.attn.qkv_self.weight + | 0.002 | -0.074 | 0.074 | 0.042 | torch.Size([540]) || stage8.4.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.2.attn.proj.weight + | -0.000 | -0.074 | 0.074 | 0.045 | torch.Size([180]) || stage8.4.residual_group.blocks.2.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc11.weight + | 0.002 | -0.074 | 0.074 | 0.041 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc12.weight + | -0.001 | -0.074 | 0.073 | 0.042 | torch.Size([360]) || 
stage8.4.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.2.mlp.fc2.weight + | 0.001 | -0.053 | 0.053 | 0.030 | torch.Size([180]) || stage8.4.residual_group.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.bias + | 0.000 | -0.082 | 0.087 | 0.020 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.3.attn.qkv_self.weight + | 0.001 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.4.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.3.attn.proj.weight + | 0.003 | -0.074 | 0.073 | 0.044 | torch.Size([180]) || stage8.4.residual_group.blocks.3.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc11.weight + | 0.001 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc12.weight + | 0.003 | -0.073 | 0.074 | 0.041 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.3.mlp.fc2.weight + | -0.002 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.4.residual_group.blocks.3.mlp.fc2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.4.linear.weight + | 0.000 | -0.074 | 0.074 | 0.043 | torch.Size([180]) || stage8.4.linear.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.bias + | -0.000 | -0.060 | 0.059 | 0.019 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.0.attn.qkv_self.weight + | -0.000 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.5.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.0.attn.proj.weight + | -0.003 | -0.074 | 0.072 | 0.044 | torch.Size([180]) || stage8.5.residual_group.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc11.weight + | -0.000 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 
0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc12.weight + | -0.000 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.0.mlp.fc2.weight + | -0.003 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.5.residual_group.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.bias + | 0.001 | -0.059 | 0.062 | 0.020 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.1.attn.qkv_self.weight + | 0.003 | -0.075 | 0.075 | 0.044 | torch.Size([540]) || stage8.5.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.1.attn.proj.weight + | -0.002 | -0.074 | 0.074 | 0.041 | torch.Size([180]) || stage8.5.residual_group.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc11.weight + | 0.002 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc12.weight + | -0.005 | -0.074 | 0.074 | 0.045 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.1.mlp.fc2.weight + | -0.001 | -0.053 | 0.052 | 0.031 | torch.Size([180]) || stage8.5.residual_group.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.bias + | -0.001 | -0.074 | 0.060 | 0.020 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.2.attn.qkv_self.weight + | -0.002 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.5.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.2.attn.proj.weight + | -0.001 | -0.073 | 0.073 | 0.045 | torch.Size([180]) || stage8.5.residual_group.blocks.2.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc11.weight + | -0.004 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 
180]) || stage8.5.residual_group.blocks.2.mlp.fc12.weight + | 0.001 | -0.075 | 0.075 | 0.044 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.2.mlp.fc2.weight + | -0.002 | -0.053 | 0.052 | 0.031 | torch.Size([180]) || stage8.5.residual_group.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.bias + | -0.000 | -0.064 | 0.085 | 0.020 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.3.attn.qkv_self.weight + | 0.001 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.5.residual_group.blocks.3.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.3.attn.proj.weight + | 0.002 | -0.074 | 0.074 | 0.044 | torch.Size([180]) || stage8.5.residual_group.blocks.3.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc11.weight + | 0.000 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc12.weight + | -0.001 | -0.074 | 0.074 | 0.042 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.3.mlp.fc2.weight + | -0.002 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.5.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.5.linear.weight + | 0.001 | -0.074 | 0.074 | 0.043 | torch.Size([180]) || stage8.5.linear.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.bias + | 0.000 | -0.064 | 0.057 | 0.020 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.074 | 0.074 | 0.042 | torch.Size([540]) || stage8.6.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.0.attn.proj.weight + | -0.003 | -0.075 | 0.073 | 0.042 | torch.Size([180]) || stage8.6.residual_group.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc11.weight + | 0.001 | -0.074 | 
0.074 | 0.044 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc12.weight + | -0.001 | -0.074 | 0.072 | 0.044 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.0.mlp.fc2.weight + | 0.001 | -0.052 | 0.052 | 0.031 | torch.Size([180]) || stage8.6.residual_group.blocks.0.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.bias + | 0.001 | -0.061 | 0.074 | 0.020 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.1.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.1.attn.qkv_self.weight + | -0.000 | -0.074 | 0.074 | 0.044 | torch.Size([540]) || stage8.6.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.1.attn.proj.weight + | 0.001 | -0.073 | 0.070 | 0.042 | torch.Size([180]) || stage8.6.residual_group.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc11.weight + | 0.002 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc12.weight + | 0.001 | -0.074 | 0.074 | 0.043 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.1.mlp.fc2.weight + | 0.001 | -0.052 | 0.053 | 0.032 | torch.Size([180]) || stage8.6.residual_group.blocks.1.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.bias + | -0.000 | -0.059 | 0.058 | 0.020 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.2.attn.qkv_self.weight + | 0.001 | -0.074 | 0.074 | 0.043 | torch.Size([540]) || stage8.6.residual_group.blocks.2.attn.qkv_self.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.2.attn.proj.weight + | 0.004 | -0.074 | 0.074 | 0.043 | torch.Size([180]) || stage8.6.residual_group.blocks.2.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc11.weight + | 0.005 | -0.074 | 0.074 | 0.044 | 
torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc12.weight + | 0.001 | -0.074 | 0.075 | 0.044 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.2.mlp.fc2.weight + | 0.001 | -0.051 | 0.051 | 0.030 | torch.Size([180]) || stage8.6.residual_group.blocks.2.mlp.fc2.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.bias + | 0.000 | -0.070 | 0.061 | 0.020 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.3.attn.qkv_self.weight + | 0.001 | -0.074 | 0.075 | 0.043 | torch.Size([540]) || stage8.6.residual_group.blocks.3.attn.qkv_self.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.3.attn.proj.weight + | -0.000 | -0.072 | 0.074 | 0.044 | torch.Size([180]) || stage8.6.residual_group.blocks.3.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc11.weight + | 0.002 | -0.074 | 0.075 | 0.043 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc12.weight + | -0.002 | -0.074 | 0.074 | 0.044 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.053 | 0.053 | 0.030 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.3.mlp.fc2.weight + | 0.001 | -0.052 | 0.053 | 0.031 | torch.Size([180]) || stage8.6.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.075 | 0.075 | 0.043 | torch.Size([180, 180]) || stage8.6.linear.weight + | 0.002 | -0.073 | 0.074 | 0.042 | torch.Size([180]) || stage8.6.linear.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([180]) || norm.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([180]) || norm.bias + | 0.000 | -0.075 | 0.075 | 0.043 | torch.Size([120, 180]) || conv_after_body.weight + | 0.004 | -0.071 | 0.072 | 0.043 | torch.Size([120]) || conv_after_body.bias + | -0.000 | -0.030 | 0.030 | 0.018 | torch.Size([64, 120, 1, 3, 3]) || conv_before_upsample.0.weight + | -0.003 | -0.029 | 0.029 | 0.018 | torch.Size([64]) || conv_before_upsample.0.bias + | -0.000 | -0.042 | 0.042 | 0.024 | torch.Size([256, 64, 1, 3, 3]) || upsample.0.weight + | -0.001 | -0.042 | 0.041 | 0.023 | torch.Size([256]) || upsample.0.bias + | -0.000 | -0.042 | 0.042 | 0.024 | torch.Size([256, 64, 1, 3, 3]) || upsample.5.weight + | -0.001 | -0.041 | 0.041 | 0.023 | torch.Size([256]) || upsample.5.bias + | 0.000 | -0.042 | 0.042 | 0.024 | torch.Size([64, 64, 1, 3, 3]) || upsample.10.weight + | 0.006 | -0.038 | 0.041 | 0.022 | torch.Size([64]) || upsample.10.bias + | 0.001 | -0.042 | 0.042 | 0.024 | torch.Size([3, 64, 1, 3, 3]) || conv_last.weight + | 0.011 | -0.006 | 0.025 | 0.016 | torch.Size([3]) || conv_last.bias +
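The rows above close the per-parameter summary that the logger emits after constructing netG: one row per state-dict entry, formatted as mean | min | max | std | shape || name. The shapes agree with the netG options: for the (6, 8, 8) attention window, each relative_position_bias_table has (2*6-1)*(2*8-1)*(2*8-1) = 2475 rows (one per 3D relative offset, 6 heads per row), and the paired relative_position_index is 384 x 384 because a window holds 6*8*8 = 384 tokens; the [225, 6] / [64, 64] pairs in stage8.5 and stage8.6 match spatial-only (1, 8, 8) windows, consistent with the two indep_reconsts stages. A table like this can be regenerated from any checkpoint; the following is a minimal sketch (describe_params is our name for it, and KAIR's exact column formatting may differ):

import torch

def describe_params(model: torch.nn.Module) -> str:
    # One row per state-dict entry: mean | min | max | std | shape || name.
    rows = []
    for name, t in model.state_dict().items():
        v = t.float()  # relative_position_index buffers are integer tensors; cast before stats
        std = v.std().item() if v.numel() > 1 else 0.0  # std of a single element is NaN
        rows.append(' | {:.3f} | {:.3f} | {:.3f} | {:.3f} | {} || {}'.format(
            v.mean().item(), v.min().item(), v.max().item(), std, t.shape, name))
    return '\n'.join(rows)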
+22-03-11 10:16:36.045 : task: 001_train_vrt_videosr_bi_reds_6frames + model: vrt + gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7] + dist: False + find_unused_parameters: False + use_static_graph: True + scale: 4 + n_channels: 3 + path:[ + root: experiments + pretrained_netG: None + pretrained_netE: None + task: experiments/001_train_vrt_videosr_bi_reds_6frames + log: experiments/001_train_vrt_videosr_bi_reds_6frames + options: experiments/001_train_vrt_videosr_bi_reds_6frames/options + models: experiments/001_train_vrt_videosr_bi_reds_6frames/models + images: experiments/001_train_vrt_videosr_bi_reds_6frames/images + pretrained_optimizerG: None + ] + datasets:[ + train:[ + name: train_dataset + dataset_type: VideoRecurrentTrainDataset + dataroot_gt: /home/cll/datasets/REDS/val/val_sharp + dataroot_lq: /home/cll/datasets/REDS/val/val_sharp_bicubic + meta_info_file: + filename_tmpl: 08d + filename_ext: png + val_partition: REDS4 + test_mode: False + io_backend:[ + type: disk + ] + num_frame: 6 + gt_size: 256 + interval_list: [1] + random_reverse: False + use_hflip: True + use_rot: True + dataloader_shuffle: True + dataloader_num_workers: 32 + dataloader_batch_size: 8 + phase: train + scale: 4 + n_channels: 3 + ] + test:[ + name: test_dataset + dataset_type: VideoRecurrentTestDataset + dataroot_gt: /home/cll/Desktop/REDS4/GT + dataroot_lq: /home/cll/Desktop/REDS4/sharp_bicubic + cache_data: True + io_backend:[ + type: disk + ] + num_frame: -1 + phase: test + scale: 4 + n_channels: 3 + ] + ] + netG:[ + net_type: vrt + upscale: 4 + img_size: [6, 64, 64] + window_size: [6, 8, 8] + depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4] + indep_reconsts: [11, 12] + embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180] + num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6] + spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth + pa_frames: 2 + deformable_groups: 12 + nonblind_denoising: False + use_checkpoint_attn: False + use_checkpoint_ffn: False + no_checkpoint_attn_blocks: [] + no_checkpoint_ffn_blocks: [] + init_type: default + scale: 4 + ] + train:[ + G_lossfn_type: charbonnier + G_lossfn_weight: 1.0 + G_charbonnier_eps: 1e-09 + E_decay: 0 + G_optimizer_type: adam + G_optimizer_lr: 0.0004 + G_optimizer_betas: [0.9, 0.99] + G_optimizer_wd: 0 + G_optimizer_clipgrad: None + G_optimizer_reuse: True + fix_iter: 20000 + fix_lr_mul: 0.125 + fix_keys: ['spynet', 'deform'] + total_iter: 300000 + G_scheduler_type: CosineAnnealingWarmRestarts + G_scheduler_periods: 300000 + G_scheduler_eta_min: 1e-07 + G_regularizer_orthstep: None + G_regularizer_clipstep: None + G_param_strict: True + E_param_strict: True + checkpoint_test: 5000 + checkpoint_save: 5000 + checkpoint_print: 200 + F_feature_layer: 34 + F_weights: 1.0 + F_lossfn_type: l1 + F_use_input_norm: True + F_use_range_norm: False + G_scheduler_restart_weights: 1 + ] + val:[ + save_img: False + pad_seq: False + flip_seq: False + center_frame_only: False + num_frame_testing: 40 + num_frame_overlapping: 2 + size_patch_testing: 128 + ] + opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json + is_train: True + merge_bn: False + merge_bn_startpoint: -1 + num_gpu: 8 + rank: 0 + world_size: 1 + +22-03-11 10:19:49.922 : task: 001_train_vrt_videosr_bi_reds_6frames + model: vrt + gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7] + dist: False + find_unused_parameters: False + use_static_graph: True + scale: 4 + n_channels: 3 + path:[ + root: experiments + pretrained_netG: /home/cll/dev/KAIR/model_zoo/vrt/ + pretrained_netE: None + task: 
experiments/001_train_vrt_videosr_bi_reds_6frames + log: experiments/001_train_vrt_videosr_bi_reds_6frames + options: experiments/001_train_vrt_videosr_bi_reds_6frames/options + models: experiments/001_train_vrt_videosr_bi_reds_6frames/models + images: experiments/001_train_vrt_videosr_bi_reds_6frames/images + pretrained_optimizerG: None + ] + datasets:[ + train:[ + name: train_dataset + dataset_type: VideoRecurrentTrainDataset + dataroot_gt: /home/cll/datasets/REDS/val/val_sharp + dataroot_lq: /home/cll/datasets/REDS/val/val_sharp_bicubic + meta_info_file: + filename_tmpl: 08d + filename_ext: png + val_partition: REDS4 + test_mode: False + io_backend:[ + type: disk + ] + num_frame: 6 + gt_size: 256 + interval_list: [1] + random_reverse: False + use_hflip: True + use_rot: True + dataloader_shuffle: True + dataloader_num_workers: 32 + dataloader_batch_size: 8 + phase: train + scale: 4 + n_channels: 3 + ] + test:[ + name: test_dataset + dataset_type: VideoRecurrentTestDataset + dataroot_gt: /home/cll/Desktop/REDS4/GT + dataroot_lq: /home/cll/Desktop/REDS4/sharp_bicubic + cache_data: True + io_backend:[ + type: disk + ] + num_frame: -1 + phase: test + scale: 4 + n_channels: 3 + ] + ] + netG:[ + net_type: vrt + upscale: 4 + img_size: [6, 64, 64] + window_size: [6, 8, 8] + depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4] + indep_reconsts: [11, 12] + embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180] + num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6] + spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth + pa_frames: 2 + deformable_groups: 12 + nonblind_denoising: False + use_checkpoint_attn: False + use_checkpoint_ffn: False + no_checkpoint_attn_blocks: [] + no_checkpoint_ffn_blocks: [] + init_type: default + scale: 4 + ] + train:[ + G_lossfn_type: charbonnier + G_lossfn_weight: 1.0 + G_charbonnier_eps: 1e-09 + E_decay: 0 + G_optimizer_type: adam + G_optimizer_lr: 0.0004 + G_optimizer_betas: [0.9, 0.99] + G_optimizer_wd: 0 + G_optimizer_clipgrad: None + G_optimizer_reuse: True + fix_iter: 20000 + fix_lr_mul: 0.125 + fix_keys: ['spynet', 'deform'] + total_iter: 300000 + G_scheduler_type: CosineAnnealingWarmRestarts + G_scheduler_periods: 300000 + G_scheduler_eta_min: 1e-07 + G_regularizer_orthstep: None + G_regularizer_clipstep: None + G_param_strict: True + E_param_strict: True + checkpoint_test: 5000 + checkpoint_save: 5000 + checkpoint_print: 200 + F_feature_layer: 34 + F_weights: 1.0 + F_lossfn_type: l1 + F_use_input_norm: True + F_use_range_norm: False + G_scheduler_restart_weights: 1 + ] + val:[ + save_img: False + pad_seq: False + flip_seq: False + center_frame_only: False + num_frame_testing: 40 + num_frame_overlapping: 2 + size_patch_testing: 128 + ] + opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json + is_train: True + merge_bn: False + merge_bn_startpoint: -1 + num_gpu: 8 + rank: 0 + world_size: 1 +
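Only the path and dataset entries change between these option dumps; the train block is identical in every run: Adam (lr 4e-4, betas [0.9, 0.99], no weight decay), cosine annealing down to eta_min = 1e-7 over total_iter = 300,000, the optical-flow and deformable-alignment parameters selected by fix_keys: ['spynet', 'deform'] trained at fix_lr_mul = 0.125 of the base rate for the first fix_iter = 20,000 iterations, and a Charbonnier loss with G_charbonnier_eps = 1e-09. For reference, a common formulation of that loss is sketched below (an assumption on our part; KAIR's implementation may differ in reduction details):

import torch

def charbonnier_loss(pred: torch.Tensor, target: torch.Tensor, eps: float = 1e-9) -> torch.Tensor:
    # Differentiable L1-like penalty: sqrt(diff^2 + eps), averaged over all elements.
    # eps matches G_charbonnier_eps above and keeps the gradient finite at zero error.
    return torch.sqrt((pred - target) ** 2 + eps).mean()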
+22-03-11 10:21:14.310 : task: 001_train_vrt_videosr_bi_reds_6frames + model: vrt + gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7] + dist: False + find_unused_parameters: False + use_static_graph: True + scale: 4 + n_channels: 3 + path:[ + root: experiments + pretrained_netG: /home/cll/dev/KAIR/model_zoo/vrt/ + pretrained_netE: None + task: experiments/001_train_vrt_videosr_bi_reds_6frames + log: experiments/001_train_vrt_videosr_bi_reds_6frames + options: experiments/001_train_vrt_videosr_bi_reds_6frames/options + models: experiments/001_train_vrt_videosr_bi_reds_6frames/models + images: experiments/001_train_vrt_videosr_bi_reds_6frames/images + pretrained_optimizerG: None + ] + datasets:[ + train:[ + name: train_dataset + dataset_type: VideoRecurrentTrainDataset + dataroot_gt: /home/cll/datasets/REDS/val/val_sharp + dataroot_lq: /home/cll/datasets/REDS/val/val_sharp_bicubic + meta_info_file: data/meta_info/meta_info_REDS_GT.txt + filename_tmpl: 08d + filename_ext: png + val_partition: REDS4 + test_mode: False + io_backend:[ + type: disk + ] + num_frame: 6 + gt_size: 256 + interval_list: [1] + random_reverse: False + use_hflip: True + use_rot: True + dataloader_shuffle: True + dataloader_num_workers: 32 + dataloader_batch_size: 8 + phase: train + scale: 4 + n_channels: 3 + ] + test:[ + name: test_dataset + dataset_type: VideoRecurrentTestDataset + dataroot_gt: /home/cll/Desktop/REDS4/GT + dataroot_lq: /home/cll/Desktop/REDS4/sharp_bicubic + cache_data: True + io_backend:[ + type: disk + ] + num_frame: -1 + phase: test + scale: 4 + n_channels: 3 + ] + ] + netG:[ + net_type: vrt + upscale: 4 + img_size: [6, 64, 64] + window_size: [6, 8, 8] + depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4] + indep_reconsts: [11, 12] + embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180] + num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6] + spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth + pa_frames: 2 + deformable_groups: 12 + nonblind_denoising: False + use_checkpoint_attn: False + use_checkpoint_ffn: False + no_checkpoint_attn_blocks: [] + no_checkpoint_ffn_blocks: [] + init_type: default + scale: 4 + ] + train:[ + G_lossfn_type: charbonnier + G_lossfn_weight: 1.0 + G_charbonnier_eps: 1e-09 + E_decay: 0 + G_optimizer_type: adam + G_optimizer_lr: 0.0004 + G_optimizer_betas: [0.9, 0.99] + G_optimizer_wd: 0 + G_optimizer_clipgrad: None + G_optimizer_reuse: True + fix_iter: 20000 + fix_lr_mul: 0.125 + fix_keys: ['spynet', 'deform'] + total_iter: 300000 + G_scheduler_type: CosineAnnealingWarmRestarts + G_scheduler_periods: 300000 + G_scheduler_eta_min: 1e-07 + G_regularizer_orthstep: None + G_regularizer_clipstep: None + G_param_strict: True + E_param_strict: True + checkpoint_test: 5000 + checkpoint_save: 5000 + checkpoint_print: 200 + F_feature_layer: 34 + F_weights: 1.0 + F_lossfn_type: l1 + F_use_input_norm: True + F_use_range_norm: False + G_scheduler_restart_weights: 1 + ] + val:[ + save_img: False + pad_seq: False + flip_seq: False + center_frame_only: False + num_frame_testing: 40 + num_frame_overlapping: 2 + size_patch_testing: 128 + ] + opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json + is_train: True + merge_bn: False + merge_bn_startpoint: -1 + num_gpu: 8 + rank: 0 + world_size: 1 + +22-03-11 10:21:14.354 : Number of train images: 27,000, iters: 3,375
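In the dump above, pretrained_netG still points at the model_zoo/vrt/ directory; the next dump (10:22:14) names the released checkpoint file 001_VRT_videosr_bi_REDS_6frames.pth instead. Initializing netG from such a file reduces to a state-dict load with strict key matching (strict mirrors G_param_strict: True). A minimal sketch, where load_pretrained is our name and the 'params' nesting is an assumption about the release layout:

import torch

def load_pretrained(net: torch.nn.Module, path: str, strict: bool = True) -> torch.nn.Module:
    # Load a .pth checkpoint into netG; strict=True mirrors G_param_strict in the options.
    state = torch.load(path, map_location='cpu')
    if isinstance(state, dict) and 'params' in state:
        state = state['params']  # assumption: released VRT checkpoints nest weights under 'params'
    net.load_state_dict(state, strict=strict)
    return net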
+22-03-11 10:22:14.208 : task: 001_train_vrt_videosr_bi_reds_6frames + model: vrt + gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7] + dist: False + find_unused_parameters: False + use_static_graph: True + scale: 4 + n_channels: 3 + path:[ + root: experiments + pretrained_netG: /home/cll/dev/KAIR/model_zoo/vrt/001_VRT_videosr_bi_REDS_6frames.pth + pretrained_netE: None + task: experiments/001_train_vrt_videosr_bi_reds_6frames + log: experiments/001_train_vrt_videosr_bi_reds_6frames + options: experiments/001_train_vrt_videosr_bi_reds_6frames/options + models: experiments/001_train_vrt_videosr_bi_reds_6frames/models + images: experiments/001_train_vrt_videosr_bi_reds_6frames/images + pretrained_optimizerG: None + ] + datasets:[ + train:[ + name: train_dataset + dataset_type: VideoRecurrentTrainDataset + dataroot_gt: /home/cll/datasets/REDS/val/val_sharp + dataroot_lq: /home/cll/datasets/REDS/val/val_sharp_bicubic + meta_info_file: data/meta_info/meta_info_REDS_GT.txt + filename_tmpl: 08d + filename_ext: png + val_partition: REDS4 + test_mode: False + io_backend:[ + type: disk + ] + num_frame: 6 + gt_size: 256 + interval_list: [1] + random_reverse: False + use_hflip: True + use_rot: True + dataloader_shuffle: True + dataloader_num_workers: 32 + dataloader_batch_size: 8 + phase: train + scale: 4 + n_channels: 3 + ] + test:[ + name: test_dataset + dataset_type: VideoRecurrentTestDataset + dataroot_gt: /home/cll/Desktop/REDS4/GT + dataroot_lq: /home/cll/Desktop/REDS4/sharp_bicubic + cache_data: True + io_backend:[ + type: disk + ] + num_frame: -1 + phase: test + scale: 4 + n_channels: 3 + ] + ] + netG:[ + net_type: vrt + upscale: 4 + img_size: [6, 64, 64] + window_size: [6, 8, 8] + depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4] + indep_reconsts: [11, 12] + embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180] + num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6] + spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth + pa_frames: 2 + deformable_groups: 12 + nonblind_denoising: False + use_checkpoint_attn: False + use_checkpoint_ffn: False + no_checkpoint_attn_blocks: [] + no_checkpoint_ffn_blocks: [] + init_type: default + scale: 4 + ] + train:[ + G_lossfn_type: charbonnier + G_lossfn_weight: 1.0 + G_charbonnier_eps: 1e-09 + E_decay: 0 + G_optimizer_type: adam + G_optimizer_lr: 0.0004 + G_optimizer_betas: [0.9, 0.99] + G_optimizer_wd: 0 + G_optimizer_clipgrad: None + G_optimizer_reuse: True + fix_iter: 20000 + fix_lr_mul: 0.125 + fix_keys: ['spynet', 'deform'] + total_iter: 300000 + G_scheduler_type: CosineAnnealingWarmRestarts + G_scheduler_periods: 300000 + G_scheduler_eta_min: 1e-07 + G_regularizer_orthstep: None + G_regularizer_clipstep: None + G_param_strict: True + E_param_strict: True + checkpoint_test: 5000 + checkpoint_save: 5000 + checkpoint_print: 200 + F_feature_layer: 34 + F_weights: 1.0 + F_lossfn_type: l1 + F_use_input_norm: True + F_use_range_norm: False + G_scheduler_restart_weights: 1 + ] + val:[ + save_img: False + pad_seq: False + flip_seq: False + center_frame_only: False + num_frame_testing: 40 + num_frame_overlapping: 2 + size_patch_testing: 128 + ] + opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json + is_train: True + merge_bn: False + merge_bn_startpoint: -1 + num_gpu: 8 + rank: 0 + world_size: 1 + +22-03-11 10:22:14.252 : Number of train images: 27,000, iters: 3,375 +22-03-11 10:22:28.605 : +Networks name: VRT +Params number: 30676435 +Net structure: +VRT( + (conv_first): Conv3d(27, 120, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (spynet): SpyNet( + (basic_module): ModuleList( + (0): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (1): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): 
Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (2): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (3): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (4): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (5): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + ) + ) + (stage1): Stage( + (reshape): Sequential( + (0): Rearrange('n c d h w -> n d h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): 
Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): 
Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage2): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): 
Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): 
Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage3): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, 
elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): 
Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage4): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) 
+ ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, 
bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage5): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + 
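The pa_deform and pa_fuse entries that close each stage carry the flow-guided alignment step, and their printed channel counts line up with that reading: conv_offset takes 242 = 120 + 120 + 2 channels, consistent with two 120-channel feature maps (current and flow-warped neighbor) plus a 2-channel optical flow, and its 324 output channels factor as 12 deformable groups x (2*3*3 offsets + 3*3 modulation masks) for a 3x3 deformable convolution; pa_fuse maps 360 = 3 x 120 channels, consistent with fusing the original features with the forward- and backward-aligned ones. These decompositions are inferences from the shapes, not quoted from the KAIR source; a quick arithmetic check in plain Python:

    kernel = 3 * 3                     # 3x3 deformable conv
    groups = 324 // (3 * kernel)       # per group: 2k offset coords + k masks
    assert 120 + 120 + 2 == 242        # conv_offset in_channels
    assert groups * (2 * kernel + kernel) == 324
    assert 3 * 120 == 360              # pa_fuse in_features
    print(groups)                      # 12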
(softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, 
inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage6): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): 
Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, 
out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage7): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): 
Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage8): ModuleList( + (0): Sequential( + (0): Rearrange('n c d h w -> n d 
h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=120, out_features=180, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (1): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (2): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + 
(norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (3): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + 
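These stage8 blocks repeat the same Mlp_GEGLU feed-forward at 180 channels (fc11/fc12: 180 -> 360, fc2: 360 -> 180). The printout lists only the layers, so the sketch below spells out the gated-GELU composition those names imply; the exact forward wiring is an assumption inferred from the standard GEGLU pattern rather than quoted from the KAIR source:

    import torch
    import torch.nn as nn

    class Mlp_GEGLU(nn.Module):
        # Layer names and shapes follow the printout; forward order is assumed.
        def __init__(self, in_features=180, hidden_features=360, drop=0.0):
            super().__init__()
            self.fc11 = nn.Linear(in_features, hidden_features)  # value branch
            self.fc12 = nn.Linear(in_features, hidden_features)  # gate branch
            self.act = nn.GELU()
            self.fc2 = nn.Linear(hidden_features, in_features)
            self.drop = nn.Dropout(drop)

        def forward(self, x):
            # GEGLU: the GELU-activated branch gates the linear branch elementwise.
            return self.drop(self.fc2(self.drop(self.act(self.fc11(x)) * self.fc12(x))))

    print(Mlp_GEGLU()(torch.randn(2, 64, 180)).shape)  # torch.Size([2, 64, 180])

Compared with a conventional MLP of the same hidden width, the second projection adds multiplicative gating at the cost of one extra 180 -> 360 weight matrix.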
(drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (4): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + 
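In these RTMSA blocks qkv_self's 540 outputs are the stacked Q/K/V (3 x 180), i.e. 6 heads of dimension 30, and proj is square (180 -> 180) because there is no mutual-attention branch to concatenate. For scale, the Linear layers of one such block account for about 0.33 M parameters; a back-of-the-envelope check against the shapes in the printout (LayerNorms and the relative position bias tables omitted):

    dim, hidden = 180, 360
    attn = (dim * 3 * dim + 3 * dim) + (dim * dim + dim)      # qkv_self + proj
    mlp = 2 * (dim * hidden + hidden) + (hidden * dim + dim)  # fc11 + fc12 + fc2
    print(attn, mlp, attn + mlp)                              # 130320 195300 325620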
(fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (5): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (6): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): 
TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + ) + (norm): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (conv_after_body): Linear(in_features=180, out_features=120, bias=True) + (conv_before_upsample): Sequential( + (0): Conv3d(120, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): LeakyReLU(negative_slope=0.01, inplace=True) + ) + (upsample): Upsample( + (0): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): Transpose_Dim12() + (2): PixelShuffle(upscale_factor=2) + (3): Transpose_Dim12() + (4): LeakyReLU(negative_slope=0.1, inplace=True) + (5): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (6): Transpose_Dim12() + (7): PixelShuffle(upscale_factor=2) + (8): Transpose_Dim12() + (9): LeakyReLU(negative_slope=0.1, inplace=True) + (10): Conv3d(64, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + ) + (conv_last): Conv3d(64, 3, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) +) + +22-03-11 10:22:28.777 : + | mean | min | max | std || shape + | -0.000 | -1.462 | 1.580 | 0.103 | torch.Size([120, 27, 1, 3, 3]) || conv_first.weight + | 0.005 | -0.950 | 0.885 | 0.268 | torch.Size([120]) || conv_first.bias + | 0.449 | 0.406 | 0.485 | 0.040 | torch.Size([1, 3, 1, 1]) || spynet.mean + | 0.226 | 0.224 | 0.229 | 0.003 | torch.Size([1, 3, 1, 1]) || spynet.std + | -0.000 | -0.679 | 0.720 | 0.066 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.0.basic_module.0.weight + | -0.042 | -0.894 | 0.351 | 0.344 | torch.Size([32]) || spynet.basic_module.0.basic_module.0.bias + | -0.008 | -3.201 | 0.948 | 0.097 
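The structure printout ends above with the reconstruction tail: conv_after_body (180 -> 120), conv_before_upsample (down to 64 channels), then an Upsample that applies two rounds of Conv3d 64 -> 256 followed by PixelShuffle(2), each bracketed by Transpose_Dim12 so the 2D pixel shuffle runs frame-by-frame on the 5D tensor; 256 = 64 * 2^2 per round, giving a 4x spatial scale overall before conv_last emits the 3-channel frames. The table that begins here lists per-tensor statistics for the whole model; a minimal helper in the same spirit (format approximated, and KAIR's own logging helper may differ; note that rows such as relative_position_index and spynet.mean come from buffers, not trainable parameters):

    import itertools
    import torch

    def describe_tensors(model: torch.nn.Module) -> None:
        # mean/min/max/std and shape for every parameter and buffer,
        # roughly matching the table format in this log
        print(' | mean | min | max | std || shape')
        for name, t in itertools.chain(model.named_parameters(), model.named_buffers()):
            v = t.detach().float()
            print(f' | {v.mean():.3f} | {v.min():.3f} | {v.max():.3f} | {v.std():.3f} '
                  f'| {v.shape} || {name}')

The bias-table shapes in these rows also check out against the attention windows: (2*2-1) * (2*8-1) * (2*8-1) = 675 relative offsets with a 128 x 128 index over (2*8*8)^2 token pairs is consistent with the residual_group1 blocks attending over frame pairs, while residual_group2 attends over all six frames, giving (2*6-1) * (2*8-1) * (2*8-1) = 2475 offsets and a 384 x 384 index; the 6 columns of each table correspond to the 6 attention heads.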
| torch.Size([64, 32, 7, 7]) || spynet.basic_module.0.basic_module.2.weight + | 0.059 | -1.268 | 0.732 | 0.320 | torch.Size([64]) || spynet.basic_module.0.basic_module.2.bias + | -0.010 | -4.633 | 0.568 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.0.basic_module.4.weight + | 0.159 | -0.704 | 0.859 | 0.353 | torch.Size([32]) || spynet.basic_module.0.basic_module.4.bias + | -0.024 | -1.714 | 0.414 | 0.091 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.0.basic_module.6.weight + | 0.780 | -1.061 | 1.162 | 0.519 | torch.Size([16]) || spynet.basic_module.0.basic_module.6.bias + | 0.000 | -0.144 | 0.163 | 0.018 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.0.basic_module.8.weight + | 0.001 | -0.003 | 0.005 | 0.006 | torch.Size([2]) || spynet.basic_module.0.basic_module.8.bias + | 0.000 | -0.726 | 0.773 | 0.070 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.1.basic_module.0.weight + | -0.021 | -0.814 | 0.355 | 0.323 | torch.Size([32]) || spynet.basic_module.1.basic_module.0.bias + | -0.010 | -3.380 | 0.916 | 0.099 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.1.basic_module.2.weight + | 0.038 | -1.207 | 0.714 | 0.301 | torch.Size([64]) || spynet.basic_module.1.basic_module.2.bias + | -0.008 | -4.462 | 0.549 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.1.basic_module.4.weight + | 0.157 | -0.742 | 0.980 | 0.384 | torch.Size([32]) || spynet.basic_module.1.basic_module.4.bias + | -0.020 | -1.648 | 0.319 | 0.084 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.1.basic_module.6.weight + | 0.775 | -1.195 | 1.148 | 0.546 | torch.Size([16]) || spynet.basic_module.1.basic_module.6.bias + | -0.000 | -0.122 | 0.152 | 0.016 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.1.basic_module.8.weight + | -0.000 | -0.002 | 0.001 | 0.002 | torch.Size([2]) || spynet.basic_module.1.basic_module.8.bias + | 0.000 | -0.956 | 0.870 | 0.088 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.2.basic_module.0.weight + | -0.025 | -1.040 | 0.512 | 0.411 | torch.Size([32]) || spynet.basic_module.2.basic_module.0.bias + | -0.011 | -4.624 | 1.195 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.2.basic_module.2.weight + | 0.023 | -1.284 | 0.699 | 0.308 | torch.Size([64]) || spynet.basic_module.2.basic_module.2.bias + | -0.009 | -1.831 | 0.616 | 0.092 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.2.basic_module.4.weight + | 0.120 | -0.695 | 0.755 | 0.332 | torch.Size([32]) || spynet.basic_module.2.basic_module.4.bias + | -0.013 | -1.285 | 0.304 | 0.068 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.2.basic_module.6.weight + | 0.681 | -1.725 | 0.942 | 0.646 | torch.Size([16]) || spynet.basic_module.2.basic_module.6.bias + | 0.000 | -0.045 | 0.071 | 0.009 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.2.basic_module.8.weight + | -0.010 | -0.010 | -0.009 | 0.000 | torch.Size([2]) || spynet.basic_module.2.basic_module.8.bias + | -0.000 | -0.995 | 0.879 | 0.090 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.3.basic_module.0.weight + | -0.040 | -1.137 | 0.617 | 0.461 | torch.Size([32]) || spynet.basic_module.3.basic_module.0.bias + | -0.010 | -4.891 | 1.224 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.3.basic_module.2.weight + | 0.022 | -1.287 | 0.745 | 0.313 | torch.Size([64]) || spynet.basic_module.3.basic_module.2.bias + | -0.010 | -1.802 | 0.561 | 0.090 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.3.basic_module.4.weight + | 0.118 | -0.694 | 0.697 | 0.329 | torch.Size([32]) || spynet.basic_module.3.basic_module.4.bias + | -0.012 
| -1.107 | 0.306 | 0.064 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.3.basic_module.6.weight + | 0.658 | -1.792 | 0.905 | 0.659 | torch.Size([16]) || spynet.basic_module.3.basic_module.6.bias + | 0.000 | -0.030 | 0.037 | 0.006 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.3.basic_module.8.weight + | 0.003 | -0.001 | 0.007 | 0.006 | torch.Size([2]) || spynet.basic_module.3.basic_module.8.bias + | -0.000 | -0.990 | 0.880 | 0.090 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.4.basic_module.0.weight + | -0.010 | -1.067 | 0.596 | 0.437 | torch.Size([32]) || spynet.basic_module.4.basic_module.0.bias + | -0.010 | -5.061 | 1.229 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.4.basic_module.2.weight + | 0.024 | -1.274 | 0.830 | 0.318 | torch.Size([64]) || spynet.basic_module.4.basic_module.2.bias + | -0.009 | -1.787 | 0.563 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.4.basic_module.4.weight + | 0.130 | -0.685 | 0.743 | 0.335 | torch.Size([32]) || spynet.basic_module.4.basic_module.4.bias + | -0.011 | -0.973 | 0.292 | 0.061 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.4.basic_module.6.weight + | 0.659 | -1.855 | 0.931 | 0.679 | torch.Size([16]) || spynet.basic_module.4.basic_module.6.bias + | 0.000 | -0.034 | 0.040 | 0.005 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.4.basic_module.8.weight + | -0.001 | -0.009 | 0.007 | 0.012 | torch.Size([2]) || spynet.basic_module.4.basic_module.8.bias + | -0.000 | -0.973 | 0.853 | 0.089 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.5.basic_module.0.weight + | 0.022 | -1.001 | 0.571 | 0.440 | torch.Size([32]) || spynet.basic_module.5.basic_module.0.bias + | -0.009 | -5.095 | 1.251 | 0.119 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.5.basic_module.2.weight + | 0.026 | -1.305 | 0.880 | 0.326 | torch.Size([64]) || spynet.basic_module.5.basic_module.2.bias + | -0.008 | -1.815 | 0.561 | 0.091 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.5.basic_module.4.weight + | 0.137 | -0.711 | 0.771 | 0.342 | torch.Size([32]) || spynet.basic_module.5.basic_module.4.bias + | -0.010 | -0.986 | 0.286 | 0.059 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.5.basic_module.6.weight + | 0.671 | -1.913 | 0.966 | 0.700 | torch.Size([16]) || spynet.basic_module.5.basic_module.6.bias + | 0.000 | -0.034 | 0.028 | 0.002 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.5.basic_module.8.weight + | 0.002 | -0.013 | 0.016 | 0.020 | torch.Size([2]) || spynet.basic_module.5.basic_module.8.bias + | 1.280 | 0.669 | 1.862 | 0.274 | torch.Size([120]) || stage1.reshape.1.weight + | -0.006 | -0.324 | 0.337 | 0.106 | torch.Size([120]) || stage1.reshape.1.bias + | 0.579 | 0.129 | 1.064 | 0.236 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.weight + | -0.039 | -1.100 | 0.894 | 0.226 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.bias + | -0.134 | -4.020 | 2.585 | 0.295 | torch.Size([675, 6]) || stage1.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.579 | 0.618 | 0.113 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_self.weight + | 0.000 | -0.319 | 0.279 | 0.074 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_self.bias + | 0.001 | -0.634 | 0.686 | 0.076 | torch.Size([120, 240]) || 
stage1.residual_group1.blocks.0.attn.proj.weight + | -0.014 | -0.222 | 0.642 | 0.088 | torch.Size([120]) || stage1.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -1.066 | 0.928 | 0.097 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.000 | -0.146 | 0.190 | 0.033 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.781 | 0.367 | 1.203 | 0.160 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.weight + | 0.029 | -0.378 | 0.545 | 0.159 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.bias + | 0.001 | -0.687 | 0.753 | 0.108 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc11.weight + | -0.010 | -0.229 | 0.633 | 0.095 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.674 | 0.669 | 0.117 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc12.weight + | 0.011 | -0.448 | 0.368 | 0.116 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc12.bias + | 0.001 | -0.862 | 0.941 | 0.119 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.mlp.fc2.weight + | -0.004 | -0.267 | 0.594 | 0.099 | torch.Size([120]) || stage1.residual_group1.blocks.0.mlp.fc2.bias + | 0.797 | 0.211 | 1.475 | 0.209 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.weight + | -0.161 | -1.941 | 0.746 | 0.237 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.bias + | -0.296 | -3.927 | 2.840 | 0.478 | torch.Size([675, 6]) || stage1.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.1.attn.position_bias + | 0.001 | -1.479 | 1.395 | 0.143 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_self.weight + | -0.003 | -0.381 | 0.258 | 0.063 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.526 | 0.561 | 0.079 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.attn.proj.weight + | -0.003 | -0.178 | 0.478 | 0.078 | torch.Size([120]) || stage1.residual_group1.blocks.1.attn.proj.bias + | 0.001 | -1.242 | 1.138 | 0.105 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.004 | -0.213 | 0.196 | 0.050 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.702 | 0.349 | 0.904 | 0.085 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.weight + | 0.039 | -0.646 | 0.384 | 0.132 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.872 | 0.750 | 0.131 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc11.weight + | -0.049 | -0.353 | 0.135 | 0.084 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.562 | 0.580 | 0.117 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc12.weight + | 0.000 | -0.238 | 0.457 | 0.113 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.828 | 0.685 | 0.123 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.mlp.fc2.weight + | 0.031 | -0.297 | 0.419 | 0.094 | torch.Size([120]) || stage1.residual_group1.blocks.1.mlp.fc2.bias + | 0.984 | 0.163 | 1.398 | 0.202 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.weight + | -0.167 | -1.609 | 0.367 | 0.182 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.bias + | -0.343 | -4.484 | 2.362 | 
0.486 | torch.Size([675, 6]) || stage1.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.2.attn.position_bias + | 0.000 | -1.586 | 1.649 | 0.151 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_self.weight + | -0.000 | -0.220 | 0.240 | 0.056 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.378 | 0.514 | 0.086 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.attn.proj.weight + | -0.009 | -0.143 | 0.172 | 0.059 | torch.Size([120]) || stage1.residual_group1.blocks.2.attn.proj.bias + | 0.001 | -0.639 | 0.582 | 0.102 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.000 | -0.141 | 0.173 | 0.035 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.733 | 0.277 | 0.903 | 0.081 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.weight + | 0.038 | -0.861 | 0.359 | 0.142 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.787 | 0.679 | 0.131 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc11.weight + | -0.029 | -0.365 | 0.143 | 0.076 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.574 | 0.539 | 0.120 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc12.weight + | -0.007 | -0.283 | 0.254 | 0.097 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc12.bias + | 0.001 | -0.998 | 0.522 | 0.124 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.mlp.fc2.weight + | 0.030 | -0.169 | 0.293 | 0.095 | torch.Size([120]) || stage1.residual_group1.blocks.2.mlp.fc2.bias + | 1.035 | 0.143 | 1.397 | 0.196 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.weight + | -0.161 | -1.413 | 0.084 | 0.154 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.bias + | -0.441 | -4.685 | 3.306 | 0.529 | torch.Size([675, 6]) || stage1.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.3.attn.position_bias + | 0.000 | -1.590 | 1.329 | 0.155 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_self.weight + | -0.002 | -0.266 | 0.232 | 0.049 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.366 | 0.372 | 0.084 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.attn.proj.weight + | -0.011 | -0.225 | 0.171 | 0.071 | torch.Size([120]) || stage1.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.660 | 0.801 | 0.100 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.001 | -0.139 | 0.200 | 0.031 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.724 | 0.190 | 0.911 | 0.091 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.weight + | 0.038 | -0.981 | 0.285 | 0.137 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.bias + | 0.001 | -0.611 | 0.598 | 0.130 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc11.weight + | -0.035 | -0.299 | 0.221 | 0.081 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.502 | 0.520 
| 0.124 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.002 | -0.271 | 0.215 | 0.090 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.558 | 0.898 | 0.127 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.010 | -0.424 | 0.190 | 0.082 | torch.Size([120]) || stage1.residual_group1.blocks.3.mlp.fc2.bias
+ | 1.085 | 0.169 | 1.400 | 0.157 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.weight
+ | -0.086 | -1.613 | 0.150 | 0.160 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.bias
+ | -0.541 | -3.902 | 3.728 | 0.633 | torch.Size([675, 6]) || stage1.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.4.attn.position_bias
+ | 0.001 | -1.879 | 1.832 | 0.150 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_self.weight
+ | 0.001 | -0.391 | 0.444 | 0.079 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_self.bias
+ | -0.000 | -0.407 | 0.448 | 0.087 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.attn.proj.weight
+ | -0.013 | -0.302 | 0.342 | 0.104 | torch.Size([120]) || stage1.residual_group1.blocks.4.attn.proj.bias
+ | -0.001 | -0.830 | 0.863 | 0.102 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_mut.weight
+ | -0.001 | -0.117 | 0.094 | 0.024 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.704 | 0.195 | 0.870 | 0.079 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.weight
+ | 0.031 | -1.069 | 0.276 | 0.140 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.bias
+ | -0.000 | -0.656 | 0.555 | 0.130 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc11.weight
+ | -0.029 | -0.387 | 0.256 | 0.102 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc11.bias
+ | 0.001 | -0.590 | 0.624 | 0.127 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc12.weight
+ | -0.011 | -0.277 | 0.303 | 0.087 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc12.bias
+ | -0.000 | -1.124 | 0.539 | 0.130 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.006 | -0.718 | 0.133 | 0.094 | torch.Size([120]) || stage1.residual_group1.blocks.4.mlp.fc2.bias
+ | 1.037 | 0.176 | 1.327 | 0.158 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.weight
+ | -0.112 | -1.591 | 0.177 | 0.169 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.bias
+ | -0.438 | -2.229 | 2.797 | 0.523 | torch.Size([675, 6]) || stage1.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -2.212 | 1.826 | 0.153 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_self.weight
+ | 0.001 | -0.343 | 0.338 | 0.068 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_self.bias
+ | 0.000 | -0.367 | 0.451 | 0.087 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.attn.proj.weight
+ | -0.022 | -0.358 | 0.242 | 0.128 | torch.Size([120]) || stage1.residual_group1.blocks.5.attn.proj.bias
+ | 0.001 | -0.922 | 0.886 | 0.104 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_mut.weight
+ | 0.002 | -0.083 | 0.089 | 0.022 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.662 | 0.277 | 0.831 | 0.066 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.weight
+ | 0.025 | -0.959 | 0.261 | 0.132 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.bias
+ | -0.001 | -0.636 | 0.739 | 0.129 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc11.weight
+ | -0.030 | -0.419 | 0.517 | 0.115 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc11.bias
+ | -0.000 | -0.615 | 0.709 | 0.126 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.002 | -0.230 | 0.457 | 0.087 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc12.bias
+ | 0.001 | -1.724 | 1.186 | 0.132 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.mlp.fc2.weight
+ | -0.019 | -1.909 | 0.255 | 0.190 | torch.Size([120]) || stage1.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.242 | 0.244 | 0.057 | torch.Size([120, 120]) || stage1.linear1.weight
+ | 0.004 | -0.221 | 0.224 | 0.083 | torch.Size([120]) || stage1.linear1.bias
+ | 0.737 | 0.334 | 1.046 | 0.119 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.weight
+ | 0.013 | -0.911 | 0.763 | 0.193 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.bias
+ | -0.052 | -2.462 | 2.040 | 0.273 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.0.attn.relative_position_index
+ | 0.000 | -0.785 | 0.767 | 0.123 | torch.Size([360, 120]) || stage1.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.009 | -0.466 | 0.552 | 0.122 | torch.Size([360]) || stage1.residual_group2.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.431 | 0.475 | 0.091 | torch.Size([120, 120]) || stage1.residual_group2.blocks.0.attn.proj.weight
+ | -0.009 | -0.796 | 0.497 | 0.109 | torch.Size([120]) || stage1.residual_group2.blocks.0.attn.proj.bias
+ | 0.573 | 0.409 | 0.935 | 0.096 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.weight
+ | 0.015 | -0.828 | 0.839 | 0.175 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.bias
+ | 0.001 | -0.604 | 0.542 | 0.109 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc11.weight
+ | 0.037 | -0.179 | 0.273 | 0.076 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.666 | 0.553 | 0.116 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.001 | -0.416 | 0.396 | 0.116 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc12.bias
+ | 0.001 | -0.654 | 0.538 | 0.118 | torch.Size([120, 240]) || stage1.residual_group2.blocks.0.mlp.fc2.weight
+ | -0.002 | -0.470 | 0.310 | 0.122 | torch.Size([120]) || stage1.residual_group2.blocks.0.mlp.fc2.bias
+ | 0.951 | 0.342 | 1.189 | 0.111 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.weight
+ | 0.010 | -0.697 | 0.802 | 0.166 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.bias
+ | -0.098 | -2.648 | 2.410 | 0.214 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.1.attn.relative_position_index
+ | -0.000 | -0.733 | 0.886 | 0.139 | torch.Size([360, 120]) || stage1.residual_group2.blocks.1.attn.qkv_self.weight
+ | -0.002 | -0.468 | 0.550 | 0.132 | torch.Size([360]) || stage1.residual_group2.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.435 | 0.377 | 0.096 | torch.Size([120, 120]) || stage1.residual_group2.blocks.1.attn.proj.weight
+ | -0.001 | -0.359 | 0.258 | 0.114 | torch.Size([120]) || stage1.residual_group2.blocks.1.attn.proj.bias
+ | 0.582 | 0.305 | 0.717 | 0.055 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.weight
+ | 0.008 | -0.714 | 0.833 | 0.131 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.bias
+ | 0.001 | -0.732 | 0.501 | 0.118 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc11.weight
+ | 0.004 | -0.306 | 0.267 | 0.091 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.510 | 0.533 | 0.126 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc12.weight
+ | -0.000 | -0.315 | 0.291 | 0.090 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.736 | 0.789 | 0.126 | torch.Size([120, 240]) || stage1.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.000 | -1.274 | 1.328 | 0.200 | torch.Size([120]) || stage1.residual_group2.blocks.1.mlp.fc2.bias
+ | -0.000 | -0.390 | 0.303 | 0.069 | torch.Size([120, 120]) || stage1.linear2.weight
+ | 0.010 | -0.219 | 0.227 | 0.087 | torch.Size([120]) || stage1.linear2.bias
+ | -0.000 | -0.095 | 0.106 | 0.024 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.weight
+ | -0.001 | -0.036 | 0.036 | 0.013 | torch.Size([120]) || stage1.pa_deform.bias
+ | -0.000 | -0.136 | 0.141 | 0.017 | torch.Size([120, 242, 3, 3]) || stage1.pa_deform.conv_offset.0.weight
+ | -0.002 | -0.028 | 0.024 | 0.013 | torch.Size([120]) || stage1.pa_deform.conv_offset.0.bias
+ | -0.001 | -0.156 | 0.104 | 0.019 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.2.weight
+ | -0.008 | -0.055 | 0.045 | 0.022 | torch.Size([120]) || stage1.pa_deform.conv_offset.2.bias
+ | -0.001 | -0.098 | 0.106 | 0.018 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.4.weight
+ | -0.000 | -0.081 | 0.070 | 0.029 | torch.Size([120]) || stage1.pa_deform.conv_offset.4.bias
+ | -0.000 | -0.375 | 0.279 | 0.027 | torch.Size([324, 120, 3, 3]) || stage1.pa_deform.conv_offset.6.weight
+ | -0.003 | -0.074 | 0.070 | 0.028 | torch.Size([324]) || stage1.pa_deform.conv_offset.6.bias
+ | -0.000 | -0.776 | 0.733 | 0.114 | torch.Size([360, 360]) || stage1.pa_fuse.fc11.weight
+ | 0.021 | -0.239 | 0.513 | 0.121 | torch.Size([360]) || stage1.pa_fuse.fc11.bias
+ | 0.001 | -1.100 | 1.143 | 0.149 | torch.Size([360, 360]) || stage1.pa_fuse.fc12.weight
+ | 0.008 | -0.405 | 0.393 | 0.136 | torch.Size([360]) || stage1.pa_fuse.fc12.bias
+ | 0.000 | -0.963 | 0.899 | 0.142 | torch.Size([120, 360]) || stage1.pa_fuse.fc2.weight
+ | -0.055 | -0.616 | 0.599 | 0.197 | torch.Size([120]) || stage1.pa_fuse.fc2.bias
+ | 1.149 | 0.345 | 1.921 | 0.289 | torch.Size([480]) || stage2.reshape.1.weight
+ | 0.017 | -0.502 | 0.663 | 0.141 | torch.Size([480]) || stage2.reshape.1.bias
+ | -0.000 | -0.609 | 0.736 | 0.146 | torch.Size([120, 480]) || stage2.reshape.2.weight
+ | 0.006 | -0.136 | 0.404 | 0.077 | torch.Size([120]) || stage2.reshape.2.bias
+ | 0.686 | 0.172 | 1.113 | 0.175 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.weight
+ | -0.154 | -0.926 | 0.339 | 0.217 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.bias
+ | -0.120 | -1.869 | 4.616 | 0.310 | torch.Size([675, 6]) || stage2.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.0.attn.position_bias
+ | 0.000 | -0.514 | 0.499 | 0.102 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.002 | -0.214 | 0.177 | 0.044 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_self.bias
+ | -0.001 | -0.499 | 0.529 | 0.093 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.attn.proj.weight
+ | -0.004 | -0.171 | 0.556 | 0.087 | torch.Size([120]) || stage2.residual_group1.blocks.0.attn.proj.bias
+ | -0.000 | -0.642 | 0.598 | 0.083 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_mut.weight
+ | -0.000 | -0.141 | 0.125 | 0.027 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.592 | 0.325 | 0.794 | 0.096 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.weight
+ | 0.008 | -0.649 | 0.445 | 0.168 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.bias
+ | 0.000 | -0.485 | 0.457 | 0.116 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc11.weight
+ | -0.053 | -0.240 | 0.171 | 0.062 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.503 | 0.462 | 0.118 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc12.weight
+ | 0.005 | -0.177 | 0.268 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.690 | 0.498 | 0.123 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.mlp.fc2.weight
+ | -0.007 | -0.270 | 0.472 | 0.097 | torch.Size([120]) || stage2.residual_group1.blocks.0.mlp.fc2.bias
+ | 0.864 | 0.187 | 1.221 | 0.164 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.weight
+ | -0.146 | -1.128 | 0.299 | 0.204 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.bias
+ | -0.241 | -1.607 | 8.958 | 0.356 | torch.Size([675, 6]) || stage2.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.1.attn.position_bias
+ | 0.000 | -0.561 | 0.538 | 0.116 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.001 | -0.198 | 0.222 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_self.bias
+ | 0.001 | -0.475 | 0.479 | 0.099 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.attn.proj.weight
+ | -0.006 | -0.295 | 0.341 | 0.101 | torch.Size([120]) || stage2.residual_group1.blocks.1.attn.proj.bias
+ | 0.001 | -0.961 | 0.789 | 0.080 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_mut.weight
+ | 0.001 | -0.105 | 0.143 | 0.024 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 0.653 | 0.401 | 0.810 | 0.063 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.weight
+ | 0.009 | -0.767 | 0.367 | 0.154 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.bias
+ | 0.001 | -0.486 | 0.499 | 0.117 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.056 | -0.185 | 0.147 | 0.058 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.529 | 0.548 | 0.121 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc12.weight
+ | 0.002 | -0.231 | 0.177 | 0.071 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc12.bias
+ | -0.001 | -0.578 | 0.609 | 0.123 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.mlp.fc2.weight
+ | -0.003 | -0.350 | 0.216 | 0.098 | torch.Size([120]) || stage2.residual_group1.blocks.1.mlp.fc2.bias
+ | 0.848 | 0.172 | 1.107 | 0.144 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.weight
+ | -0.168 | -1.123 | 0.330 | 0.178 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.bias
+ | -0.074 | -1.239 | 4.293 | 0.247 | torch.Size([675, 6]) || stage2.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.2.attn.position_bias
+ | -0.001 | -0.643 | 0.531 | 0.117 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_self.weight
+ | 0.003 | -0.220 | 0.376 | 0.047 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_self.bias
+ | 0.000 | -0.529 | 0.479 | 0.100 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.attn.proj.weight
+ | 0.002 | -0.230 | 0.295 | 0.074 | torch.Size([120]) || stage2.residual_group1.blocks.2.attn.proj.bias
+ | -0.001 | -0.726 | 0.768 | 0.091 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_mut.weight
+ | 0.001 | -0.167 | 0.193 | 0.028 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 0.695 | 0.334 | 0.833 | 0.068 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.weight
+ | 0.012 | -0.755 | 0.517 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.bias
+ | 0.001 | -0.474 | 0.480 | 0.119 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.049 | -0.218 | 0.148 | 0.067 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc11.bias
+ | 0.000 | -0.529 | 0.542 | 0.124 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc12.weight
+ | -0.006 | -0.245 | 0.239 | 0.073 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc12.bias
+ | -0.001 | -0.541 | 0.485 | 0.124 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.mlp.fc2.weight
+ | 0.000 | -0.318 | 0.170 | 0.077 | torch.Size([120]) || stage2.residual_group1.blocks.2.mlp.fc2.bias
+ | 0.903 | 0.178 | 1.124 | 0.124 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.weight
+ | -0.138 | -1.223 | 0.440 | 0.177 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.bias
+ | -0.164 | -1.383 | 5.910 | 0.305 | torch.Size([675, 6]) || stage2.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.3.attn.position_bias
+ | -0.000 | -0.526 | 0.496 | 0.120 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_self.weight
+ | 0.000 | -0.250 | 0.273 | 0.061 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_self.bias
+ | 0.000 | -0.447 | 0.524 | 0.097 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.attn.proj.weight
+ | -0.003 | -0.243 | 0.256 | 0.082 | torch.Size([120]) || stage2.residual_group1.blocks.3.attn.proj.bias
+ | -0.001 | -0.551 | 0.730 | 0.083 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_mut.weight
+ | -0.001 | -0.145 | 0.126 | 0.024 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 0.707 | 0.319 | 0.855 | 0.063 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.weight
+ | 0.013 | -0.839 | 0.507 | 0.155 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.bias
+ | 0.000 | -0.509 | 0.508 | 0.118 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.051 | -0.219 | 0.155 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc11.bias
+ | -0.000 | -0.475 | 0.592 | 0.124 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.002 | -0.162 | 0.220 | 0.069 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.465 | 0.528 | 0.124 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.mlp.fc2.weight
+ | -0.002 | -0.243 | 0.286 | 0.088 | torch.Size([120]) || stage2.residual_group1.blocks.3.mlp.fc2.bias
+ | 0.948 | 0.220 | 1.175 | 0.108 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.weight
+ | -0.125 | -1.093 | 0.385 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.bias
+ | -0.150 | -1.632 | 4.522 | 0.341 | torch.Size([675, 6]) || stage2.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.636 | 0.543 | 0.119 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.001 | -0.254 | 0.262 | 0.048 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_self.bias
+ | 0.001 | -0.632 | 0.628 | 0.112 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.attn.proj.weight
+ | -0.005 | -0.240 | 0.330 | 0.104 | torch.Size([120]) || stage2.residual_group1.blocks.4.attn.proj.bias
+ | 0.000 | -0.476 | 0.479 | 0.088 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_mut.weight
+ | -0.001 | -0.112 | 0.134 | 0.020 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.686 | 0.264 | 0.797 | 0.060 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.weight
+ | 0.012 | -0.889 | 0.427 | 0.140 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.bias
+ | 0.001 | -0.476 | 0.478 | 0.117 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc11.weight
+ | -0.051 | -0.267 | 0.180 | 0.071 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc11.bias
+ | 0.000 | -0.506 | 0.517 | 0.127 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.002 | -0.172 | 0.241 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc12.bias
+ | -0.001 | -0.570 | 0.542 | 0.126 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.003 | -0.631 | 0.395 | 0.123 | torch.Size([120]) || stage2.residual_group1.blocks.4.mlp.fc2.bias
+ | 0.912 | 0.189 | 1.122 | 0.104 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.weight
+ | -0.114 | -1.125 | 0.188 | 0.140 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.bias
+ | -0.099 | -1.285 | 1.708 | 0.236 | torch.Size([675, 6]) || stage2.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -0.496 | 0.540 | 0.119 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_self.weight
+ | 0.003 | -0.260 | 0.228 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_self.bias
+ | -0.000 | -0.511 | 0.454 | 0.095 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.attn.proj.weight
+ | 0.000 | -0.711 | 0.286 | 0.115 | torch.Size([120]) || stage2.residual_group1.blocks.5.attn.proj.bias
+ | 0.000 | -0.444 | 0.454 | 0.082 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_mut.weight
+ | -0.000 | -0.101 | 0.133 | 0.021 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.668 | 0.312 | 0.800 | 0.056 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.weight
+ | 0.015 | -0.778 | 0.372 | 0.111 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.bias
+ | -0.000 | -0.485 | 0.469 | 0.115 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc11.weight
+ | -0.045 | -0.294 | 0.173 | 0.083 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc11.bias
+ | 0.000 | -0.554 | 0.540 | 0.129 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.001 | -0.183 | 0.199 | 0.077 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc12.bias
+ | 0.000 | -0.879 | 0.824 | 0.127 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.mlp.fc2.weight
+ | 0.001 | -1.670 | 0.358 | 0.208 | torch.Size([120]) || stage2.residual_group1.blocks.5.mlp.fc2.bias
+ | 0.001 | -0.253 | 0.346 | 0.068 | torch.Size([120, 120]) || stage2.linear1.weight
+ | 0.007 | -0.248 | 0.241 | 0.103 | torch.Size([120]) || stage2.linear1.bias
+ | 1.012 | 0.613 | 1.327 | 0.116 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.weight
+ | 0.019 | -0.724 | 0.685 | 0.244 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.bias
+ | 0.003 | -2.959 | 1.705 | 0.151 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.636 | 0.617 | 0.125 | torch.Size([360, 120]) || stage2.residual_group2.blocks.0.attn.qkv_self.weight
+ | -0.002 | -0.291 | 0.292 | 0.085 | torch.Size([360]) || stage2.residual_group2.blocks.0.attn.qkv_self.bias
+ | -0.002 | -0.476 | 0.512 | 0.138 | torch.Size([120, 120]) || stage2.residual_group2.blocks.0.attn.proj.weight
+ | -0.002 | -0.263 | 0.398 | 0.135 | torch.Size([120]) || stage2.residual_group2.blocks.0.attn.proj.bias
+ | 0.677 | 0.521 | 0.840 | 0.063 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.weight
+ | 0.010 | -0.710 | 0.541 | 0.173 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.bias
+ | 0.001 | -0.540 | 0.507 | 0.112 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.016 | -0.242 | 0.201 | 0.077 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.519 | 0.479 | 0.122 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.006 | -0.162 | 0.231 | 0.071 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc12.bias
+ | -0.001 | -0.449 | 0.494 | 0.121 | torch.Size([120, 240]) || stage2.residual_group2.blocks.0.mlp.fc2.weight
+ | 0.002 | -0.293 | 0.222 | 0.095 | torch.Size([120]) || stage2.residual_group2.blocks.0.mlp.fc2.bias
+ | 1.053 | 0.832 | 1.269 | 0.079 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.weight
+ | 0.015 | -0.549 | 0.428 | 0.189 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.bias
+ | 0.007 | -3.099 | 1.550 | 0.170 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.1.attn.relative_position_index
+ | 0.000 | -0.673 | 0.604 | 0.131 | torch.Size([360, 120]) || stage2.residual_group2.blocks.1.attn.qkv_self.weight
+ | -0.001 | -0.416 | 0.391 | 0.089 | torch.Size([360]) || stage2.residual_group2.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.569 | 0.560 | 0.139 | torch.Size([120, 120]) || stage2.residual_group2.blocks.1.attn.proj.weight
+ | 0.004 | -0.613 | 0.428 | 0.158 | torch.Size([120]) || stage2.residual_group2.blocks.1.attn.proj.bias
+ | 0.762 | 0.464 | 0.954 | 0.085 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.weight
+ | 0.005 | -0.745 | 0.381 | 0.117 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.bias
+ | 0.000 | -0.441 | 0.448 | 0.110 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc11.weight
+ | 0.019 | -0.292 | 0.460 | 0.117 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.491 | 0.490 | 0.126 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc12.weight
+ | -0.007 | -0.285 | 0.177 | 0.068 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.535 | 0.631 | 0.125 | torch.Size([120, 240]) || stage2.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.011 | -0.765 | 0.337 | 0.142 | torch.Size([120]) || stage2.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.001 | -0.367 | 0.372 | 0.074 | torch.Size([120, 120]) || stage2.linear2.weight
+ | 0.009 | -0.288 | 0.342 | 0.130 | torch.Size([120]) || stage2.linear2.bias
+ | 0.000 | -0.112 | 0.093 | 0.022 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.weight
+ | -0.002 | -0.036 | 0.035 | 0.016 | torch.Size([120]) || stage2.pa_deform.bias
+ | 0.000 | -0.068 | 0.080 | 0.016 | torch.Size([120, 242, 3, 3]) || stage2.pa_deform.conv_offset.0.weight
+ | -0.009 | -0.035 | 0.023 | 0.013 | torch.Size([120]) || stage2.pa_deform.conv_offset.0.bias
+ | 0.000 | -0.068 | 0.079 | 0.019 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.2.weight
+ | -0.014 | -0.061 | 0.036 | 0.021 | torch.Size([120]) || stage2.pa_deform.conv_offset.2.bias
+ | -0.001 | -0.082 | 0.079 | 0.019 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.4.weight
+ | -0.003 | -0.075 | 0.069 | 0.035 | torch.Size([120]) || stage2.pa_deform.conv_offset.4.bias
+ | -0.000 | -0.166 | 0.139 | 0.016 | torch.Size([324, 120, 3, 3]) || stage2.pa_deform.conv_offset.6.weight
+ | -0.015 | -0.090 | 0.050 | 0.030 | torch.Size([324]) || stage2.pa_deform.conv_offset.6.bias
+ | -0.002 | -0.642 | 0.663 | 0.127 | torch.Size([360, 360]) || stage2.pa_fuse.fc11.weight
+ | 0.130 | -0.171 | 0.480 | 0.140 | torch.Size([360]) || stage2.pa_fuse.fc11.bias
+ | -0.000 | -0.696 | 0.620 | 0.118 | torch.Size([360, 360]) || stage2.pa_fuse.fc12.weight
+ | -0.007 | -0.337 | 0.301 | 0.102 | torch.Size([360]) || stage2.pa_fuse.fc12.bias
+ | 0.000 | -0.650 | 0.657 | 0.128 | torch.Size([120, 360]) || stage2.pa_fuse.fc2.weight
+ | 0.013 | -0.507 | 0.451 | 0.215 | torch.Size([120]) || stage2.pa_fuse.fc2.bias
+ | 1.067 | 0.372 | 1.778 | 0.269 | torch.Size([480]) || stage3.reshape.1.weight
+ | -0.004 | -0.699 | 0.521 | 0.227 | torch.Size([480]) || stage3.reshape.1.bias
+ | -0.000 | -0.643 | 0.743 | 0.138 | torch.Size([120, 480]) || stage3.reshape.2.weight
+ | 0.009 | -0.176 | 0.243 | 0.079 | torch.Size([120]) || stage3.reshape.2.bias
+ | 0.785 | 0.469 | 1.029 | 0.105 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.weight
+ | -0.102 | -0.716 | 0.311 | 0.179 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.bias
+ | -0.001 | -0.340 | 0.163 | 0.033 | torch.Size([675, 6]) || stage3.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.0.attn.position_bias
+ | -0.000 | -0.328 | 0.302 | 0.061 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_self.weight
+ | 0.004 | -0.232 | 0.189 | 0.063 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.343 | 0.346 | 0.058 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.attn.proj.weight
+ | 0.004 | -0.335 | 0.229 | 0.102 | torch.Size([120]) || stage3.residual_group1.blocks.0.attn.proj.bias
+ | -0.000 | -0.366 | 0.325 | 0.052 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_mut.weight
+ | -0.001 | -0.091 | 0.074 | 0.017 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.751 | 0.517 | 0.928 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.weight
+ | 0.002 | -0.271 | 0.189 | 0.101 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.bias
+ | 0.000 | -0.371 | 0.388 | 0.096 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc11.weight
+ | -0.073 | -0.203 | 0.039 | 0.046 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.400 | 0.401 | 0.094 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc12.weight
+ | -0.000 | -0.178 | 0.128 | 0.052 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.001 | -0.410 | 0.429 | 0.098 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.006 | -0.345 | 0.304 | 0.108 | torch.Size([120]) || stage3.residual_group1.blocks.0.mlp.fc2.bias
+ | 0.816 | 0.469 | 1.015 | 0.110 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.weight
+ | -0.103 | -0.647 | 0.225 | 0.140 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.bias
+ | 0.001 | -0.464 | 0.239 | 0.034 | torch.Size([675, 6]) || stage3.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.1.attn.position_bias
+ | -0.000 | -0.304 | 0.359 | 0.061 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.001 | -0.173 | 0.193 | 0.047 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.299 | 0.408 | 0.055 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.attn.proj.weight
+ | 0.007 | -0.511 | 0.239 | 0.113 | torch.Size([120]) || stage3.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.288 | 0.254 | 0.049 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_mut.weight
+ | 0.001 | -0.060 | 0.054 | 0.016 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 0.796 | 0.609 | 0.971 | 0.076 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.weight
+ | -0.002 | -0.327 | 0.247 | 0.122 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.bias
+ | 0.001 | -0.379 | 0.407 | 0.094 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.077 | -0.214 | 0.034 | 0.045 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.391 | 0.432 | 0.092 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc12.weight
+ | 0.005 | -0.176 | 0.112 | 0.044 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.378 | 0.399 | 0.093 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.009 | -0.410 | 0.306 | 0.110 | torch.Size([120]) || stage3.residual_group1.blocks.1.mlp.fc2.bias
+ | 0.854 | 0.447 | 0.995 | 0.090 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.weight
+ | -0.086 | -0.513 | 0.198 | 0.116 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.bias
+ | -0.001 | -0.189 | 0.292 | 0.033 | torch.Size([675, 6]) || stage3.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.2.attn.position_bias
+ | 0.000 | -0.390 | 0.367 | 0.067 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_self.weight
+ | -0.002 | -0.310 | 0.284 | 0.078 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_self.bias
+ | 0.000 | -0.334 | 0.296 | 0.061 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.attn.proj.weight
+ | 0.004 | -0.356 | 0.299 | 0.096 | torch.Size([120]) || stage3.residual_group1.blocks.2.attn.proj.bias
+ | 0.000 | -0.276 | 0.315 | 0.055 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_mut.weight
+ | 0.000 | -0.094 | 0.066 | 0.014 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 0.829 | 0.673 | 1.017 | 0.074 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.weight
+ | 0.003 | -0.259 | 0.228 | 0.098 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.bias
+ | 0.001 | -0.410 | 0.385 | 0.091 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.085 | -0.200 | 0.017 | 0.044 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc11.bias
+ | 0.000 | -0.348 | 0.378 | 0.090 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc12.weight
+ | 0.001 | -0.130 | 0.105 | 0.042 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc12.bias
+ | 0.000 | -0.346 | 0.425 | 0.090 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.mlp.fc2.weight
+ | 0.005 | -0.363 | 0.241 | 0.094 | torch.Size([120]) || stage3.residual_group1.blocks.2.mlp.fc2.bias
+ | 0.872 | 0.554 | 1.068 | 0.102 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.weight
+ | -0.057 | -0.402 | 0.133 | 0.087 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.bias
+ | 0.003 | -0.365 | 0.217 | 0.050 | torch.Size([675, 6]) || stage3.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.359 | 0.357 | 0.065 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_self.weight
+ | -0.002 | -0.265 | 0.294 | 0.062 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_self.bias
+ | -0.000 | -0.300 | 0.271 | 0.054 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.attn.proj.weight
+ | 0.002 | -0.316 | 0.215 | 0.094 | torch.Size([120]) || stage3.residual_group1.blocks.3.attn.proj.bias
+ | 0.000 | -0.370 | 0.329 | 0.039 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_mut.weight
+ | 0.000 | -0.056 | 0.066 | 0.013 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 0.842 | 0.631 | 0.989 | 0.073 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.weight
+ | -0.001 | -0.216 | 0.263 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.bias
+ | 0.001 | -0.388 | 0.391 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.087 | -0.202 | 0.032 | 0.048 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc11.bias
+ | 0.000 | -0.364 | 0.428 | 0.088 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.000 | -0.137 | 0.106 | 0.043 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc12.bias
+ | -0.001 | -0.390 | 0.339 | 0.088 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.003 | -0.376 | 0.203 | 0.090 | torch.Size([120]) || stage3.residual_group1.blocks.3.mlp.fc2.bias
+ | 0.913 | 0.498 | 1.102 | 0.096 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.weight
+ | -0.048 | -0.340 | 0.105 | 0.071 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.bias
+ | 0.001 | -0.706 | 0.306 | 0.058 | torch.Size([675, 6]) || stage3.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.4.attn.position_bias
+ | 0.000 | -0.373 | 0.339 | 0.076 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.004 | -0.301 | 0.301 | 0.074 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_self.bias
+ | 0.000 | -0.278 | 0.277 | 0.058 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.attn.proj.weight
+ | 0.003 | -0.310 | 0.240 | 0.079 | torch.Size([120]) || stage3.residual_group1.blocks.4.attn.proj.bias
+ | -0.000 | -0.350 | 0.322 | 0.046 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_mut.weight
+ | -0.000 | -0.045 | 0.064 | 0.010 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.862 | 0.679 | 0.990 | 0.059 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.weight
+ | -0.004 | -0.313 | 0.190 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.bias
+ | 0.001 | -0.370 | 0.364 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc11.weight
+ | -0.092 | -0.231 | 0.129 | 0.057 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc11.bias
+ | -0.000 | -0.375 | 0.511 | 0.090 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.002 | -0.114 | 0.114 | 0.040 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc12.bias
+ | -0.000 | -0.389 | 0.354 | 0.088 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.mlp.fc2.weight
+ | 0.005 | -0.258 | 0.164 | 0.073 | torch.Size([120]) || stage3.residual_group1.blocks.4.mlp.fc2.bias
+ | 0.899 | 0.480 | 1.089 | 0.103 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.weight
+ | -0.030 | -0.257 | 0.115 | 0.056 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.bias
+ | 0.003 | -0.462 | 0.290 | 0.069 | torch.Size([675, 6]) || stage3.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.5.attn.position_bias
+ | 0.000 | -0.391 | 0.365 | 0.069 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_self.weight
+ | -0.004 | -0.232 | 0.302 | 0.064 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_self.bias
+ | -0.000 | -0.267 | 0.293 | 0.051 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.attn.proj.weight
+ | 0.000 | -0.250 | 0.182 | 0.070 | torch.Size([120]) || stage3.residual_group1.blocks.5.attn.proj.bias
+ | -0.000 | -0.238 | 0.257 | 0.033 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_mut.weight
+ | -0.001 | -0.032 | 0.033 | 0.008 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.864 | 0.651 | 1.029 | 0.070 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.weight
+ | -0.003 | -0.212 | 0.175 | 0.075 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.bias
+ | 0.000 | -0.378 | 0.379 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc11.weight
+ | -0.097 | -0.308 | 0.026 | 0.051 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc11.bias
+ | 0.000 | -0.578 | 0.401 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc12.weight
+ | -0.005 | -0.166 | 0.131 | 0.049 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc12.bias
+ | 0.000 | -0.358 | 0.376 | 0.085 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.mlp.fc2.weight
+ | 0.001 | -0.262 | 0.176 | 0.072 | torch.Size([120]) || stage3.residual_group1.blocks.5.mlp.fc2.bias
+ | 0.003 | -0.284 | 0.467 | 0.071 | torch.Size([120, 120]) || stage3.linear1.weight
+ | 0.006 | -0.201 | 0.269 | 0.090 | torch.Size([120]) || stage3.linear1.bias
+ | 0.877 | 0.568 | 1.197 | 0.115 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.weight
+ | 0.002 | -0.248 | 0.324 | 0.100 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.bias
+ | 0.000 | -0.261 | 0.125 | 0.029 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.563 | 0.552 | 0.074 | torch.Size([360, 120]) || stage3.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.005 | -0.257 | 0.302 | 0.081 | torch.Size([360]) || stage3.residual_group2.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.390 | 0.385 | 0.084 | torch.Size([120, 120]) || stage3.residual_group2.blocks.0.attn.proj.weight
+ | 0.002 | -0.450 | 0.235 | 0.125 | torch.Size([120]) || stage3.residual_group2.blocks.0.attn.proj.bias
+ | 0.986 | 0.755 | 1.165 | 0.078 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.weight
+ | -0.000 | -0.260 | 0.169 | 0.076 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.bias
+ | 0.000 | -0.355 | 0.397 | 0.087 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.046 | -0.220 | 0.086 | 0.055 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.424 | 0.368 | 0.089 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.006 | -0.111 | 0.122 | 0.038 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.354 | 0.374 | 0.090 | torch.Size([120, 240]) || stage3.residual_group2.blocks.0.mlp.fc2.weight
+ | 0.001 | -0.374 | 0.272 | 0.101 | torch.Size([120]) || stage3.residual_group2.blocks.0.mlp.fc2.bias
+ | 0.919 | 0.643 | 1.132 | 0.100 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.weight
+ | 0.000 | -0.177 | 0.181 | 0.063 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.bias
+ | 0.000 | -0.332 | 0.131 | 0.028 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.1.attn.relative_position_index
+ | -0.000 | -0.418 | 0.362 | 0.069 | torch.Size([360, 120]) || stage3.residual_group2.blocks.1.attn.qkv_self.weight
+ | -0.004 | -0.375 | 0.347 | 0.082 | torch.Size([360]) || stage3.residual_group2.blocks.1.attn.qkv_self.bias
+ | -0.001 | -0.294 | 0.354 | 0.077 | torch.Size([120, 120]) || stage3.residual_group2.blocks.1.attn.proj.weight
+ | 0.003 | -0.432 | 0.259 | 0.101 | torch.Size([120]) || stage3.residual_group2.blocks.1.attn.proj.bias
+ | 1.012 | 0.750 | 1.178 | 0.077 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.weight
+ | -0.001 | -0.171 | 0.155 | 0.060 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.bias
+ | 0.000 | -0.331 | 0.356 | 0.087 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.035 | -0.207 | 0.197 | 0.065 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.399 | 0.398 | 0.092 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc12.weight
+ | -0.002 | -0.111 | 0.129 | 0.041 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.001 | -0.353 | 0.330 | 0.088 | torch.Size([120, 240]) || stage3.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.001 | -0.328 | 0.127 | 0.064 | torch.Size([120]) || stage3.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.003 | -0.289 | 0.519 | 0.073 | torch.Size([120, 120]) || stage3.linear2.weight
+ | 0.002 | -0.318 | 0.371 | 0.144 | torch.Size([120]) || stage3.linear2.bias
+ | -0.000 | -0.086 | 0.095 | 0.022 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.weight
+ | -0.002 | -0.023 | 0.021 | 0.010 | torch.Size([120]) || stage3.pa_deform.bias
+ | -0.000 | -0.060 | 0.056 | 0.015 | torch.Size([120, 242, 3, 3]) || stage3.pa_deform.conv_offset.0.weight
+ | -0.008 | -0.035 | 0.019 | 0.013 | torch.Size([120]) || stage3.pa_deform.conv_offset.0.bias
+ | -0.001 | -0.064 | 0.062 | 0.019 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.2.weight
+ | -0.007 | -0.044 | 0.031 | 0.019 | torch.Size([120]) || stage3.pa_deform.conv_offset.2.bias
+ | 0.000 | -0.062 | 0.063 | 0.019 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.4.weight
+ | -0.006 | -0.052 | 0.043 | 0.021 | torch.Size([120]) || stage3.pa_deform.conv_offset.4.bias
+ | 0.000 | -0.081 | 0.080 | 0.011 | torch.Size([324, 120, 3, 3]) || stage3.pa_deform.conv_offset.6.weight
+ | -0.004 | -0.087 | 0.083 | 0.021 | torch.Size([324]) || stage3.pa_deform.conv_offset.6.bias
+ | -0.002 | -0.465 | 0.513 | 0.101 | torch.Size([360, 360]) || stage3.pa_fuse.fc11.weight
+ | 0.059 | -0.251 | 0.595 | 0.104 | torch.Size([360]) || stage3.pa_fuse.fc11.bias
+ | -0.000 | -0.544 | 0.531 | 0.100 | torch.Size([360, 360]) || stage3.pa_fuse.fc12.weight
+ | 0.001 | -0.589 | 0.433 | 0.106 | torch.Size([360]) || stage3.pa_fuse.fc12.bias
+ | -0.000 | -0.535 | 0.562 | 0.127 | torch.Size([120, 360]) || stage3.pa_fuse.fc2.weight
+ | -0.001 | -0.401 | 0.342 | 0.121 | torch.Size([120]) || stage3.pa_fuse.fc2.bias
+ | 0.997 | 0.921 | 1.125 | 0.028 | torch.Size([480]) || stage4.reshape.1.weight
+ | -0.000 | -0.058 | 0.059 | 0.022 | torch.Size([480]) || stage4.reshape.1.bias
+ | 0.000 | -0.155 | 0.150 | 0.031 | torch.Size([120, 480]) || stage4.reshape.2.weight
+ | 0.001 | -0.016 | 0.016 | 0.006 | torch.Size([120]) || stage4.reshape.2.bias
+ | 1.002 | 0.999 | 1.009 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.weight
+ | 0.000 | -0.002 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.bias
+ | -0.000 | -0.071 | 0.066 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.0.attn.position_bias
+ | 0.000 | -0.093 | 0.081 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.000 | -0.009 | 0.009 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.080 | 0.097 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.attn.proj.weight
+ | 0.000 | -0.035 | 0.027 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.0.attn.proj.bias
+ | 0.000 | -0.080 | 0.079 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_mut.weight
+ | -0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.bias
+ | -0.000 | -0.079 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc11.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.087 | 0.092 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc12.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.080 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.031 | 0.029 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.0.mlp.fc2.bias
+ | 1.002 | 0.997 | 1.007 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.weight
+ | -0.000 | -0.002 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.bias
+ | 0.000 | -0.066 | 0.065 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.1.attn.position_bias
+ | -0.000 | -0.078 | 0.081 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.006 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.080 | 0.083 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.attn.proj.weight
+ | -0.000 | -0.027 | 0.029 | 0.012 | torch.Size([120]) || stage4.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.077 | 0.082 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_mut.weight
+ | -0.000 | -0.006 | 0.009 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.weight
+ | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.bias
+ | -0.000 | -0.080 | 0.078 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.077 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.084 | 0.075 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.000 | -0.034 | 0.031 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.1.mlp.fc2.bias
+ | 1.002 | 0.996 | 1.008 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.weight
+ | -0.000 | -0.003 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.bias
+ | 0.001 | -0.070 | 0.071 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.2.attn.position_bias
+ | 0.000 | -0.091 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_self.weight
+ | -0.000 | -0.007 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_self.bias
+ | 0.000 | -0.080 | 0.084 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.attn.proj.weight
+ | -0.000 | -0.023 | 0.026 | 0.010 | torch.Size([120]) || stage4.residual_group1.blocks.2.attn.proj.bias
+ | -0.000 | -0.107 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_mut.weight
+ | 0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.weight
+ | 0.000 | -0.000 | 0.001 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.bias
+ | 0.000 | -0.076 | 0.077 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.000 | -0.005 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc11.bias
+ | -0.000 | -2.000 | 0.081 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc12.bias
+ | -0.000 | -0.084 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.mlp.fc2.weight
+ | 0.000 | -0.027 | 0.024 | 0.010 | torch.Size([120]) || stage4.residual_group1.blocks.2.mlp.fc2.bias
+ | 1.002 | 0.999 | 1.012 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.weight
+ | -0.000 | -0.003 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.bias
+ | 0.000 | -0.064 | 0.071 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.099 | 0.088 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_self.weight
+ | 0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_self.bias
+ | -0.000 | -0.083 | 0.084 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.attn.proj.weight
+ | -0.000 | -0.019 | 0.018 | 0.008 | torch.Size([120]) || stage4.residual_group1.blocks.3.attn.proj.bias
+ | 0.000 | -0.079 | 0.084 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_mut.weight
+ | -0.000 | -0.004 | 0.004 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.weight
+ | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.bias
+ | -0.000 | -0.078 | 0.081 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc11.bias
+ | -0.000 | -0.087 | 0.076 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc12.bias
+ | -0.000 | -0.079 | 0.082 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.000 | -0.022 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group1.blocks.3.mlp.fc2.bias
+ | 1.002 | 0.998 | 1.011 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.weight
+ | -0.001 | -0.004 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.bias
+ | 0.000 | -0.089 | 0.081 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.080 | 0.085 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_self.bias
+ | -0.000 | -0.075 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.attn.proj.weight
+ | -0.000 | -0.021 | 0.016 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.4.attn.proj.bias
+ | 0.000 | -0.082 | 0.088 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_mut.weight
+ | -0.000 | -0.004 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.weight
+ | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.bias
+ | -0.000 | -0.086 | 0.080 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc11.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc11.bias
+ | 0.000 | -0.084 | 0.083 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc12.bias
+ | 0.000 | -0.076 | 0.081 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.000 | -0.018 | 0.015 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.4.mlp.fc2.bias
+ | 1.003 | 0.997 | 1.014 | 0.003 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.weight
+ | -0.001 | -0.005 | 0.004 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.bias
+ | -0.001 | -0.070 | 0.069 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -0.097 | 0.082 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_self.weight
+ | 0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_self.bias
+ | -0.000 | -0.075 | 0.089 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.attn.proj.weight
+ | 0.000 | -0.016 | 0.015 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.5.attn.proj.bias
+ | 0.000 | -0.083 | 0.091 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_mut.weight
+ | 0.000 | -0.006 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.bias
+ | 0.000 | -0.093 | 0.083 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc11.weight
+ | 0.000 | -0.002 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc11.bias
+ | 0.000 | -0.086 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc12.bias
+ | 0.000 | -0.079 | 0.092 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.mlp.fc2.weight
+ | -0.000 | -0.012 | 0.016 | 0.005 | torch.Size([120]) || stage4.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.090 | 0.111 | 0.024 | torch.Size([120, 120]) || stage4.linear1.weight
+ | 0.001 | -0.019 | 0.029 | 0.009 | torch.Size([120]) || stage4.linear1.bias
+ | 1.000 | 0.999 | 1.003 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.bias
+ | -0.000 | -0.078 | 0.075 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.0.attn.relative_position_index
+ | 0.000 | -0.084 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.000 | -0.005 | 0.004 | 0.001 | torch.Size([360]) || stage4.residual_group2.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.079 | 0.080 | 0.020 | torch.Size([120, 120]) || stage4.residual_group2.blocks.0.attn.proj.weight
+ | 0.000 | -0.021 | 0.024 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.0.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.bias
+ | -0.000 | -0.079 | 0.072 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.077 | 0.078 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.102 | 0.078 | 0.020 | torch.Size([120, 240]) || stage4.residual_group2.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.024 | 0.020 | 0.009 | torch.Size([120]) || stage4.residual_group2.blocks.0.mlp.fc2.bias
+ | 1.001 | 0.998 | 1.003 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.weight
+ | -0.000 | -0.002 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.bias
+ | -0.000 | -0.071 | 0.079 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.1.attn.relative_position_index
+ | 0.000 | -0.078 | 0.096 | 0.020 | torch.Size([360, 120]) || stage4.residual_group2.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.005 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group2.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.077 | 0.080 | 0.020 | torch.Size([120, 120]) || stage4.residual_group2.blocks.1.attn.proj.weight
+ | 0.000 | -0.020 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.1.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.bias
+ | -0.000 | -0.085 | 0.082 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.083 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.000 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.078 | 0.078 | 0.020 | torch.Size([120, 240]) || stage4.residual_group2.blocks.1.mlp.fc2.weight
+ | 0.000 | -0.022 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.000 | -0.092 | 0.112 | 0.023 | torch.Size([120, 120]) || stage4.linear2.weight
+ | 0.000 | -0.032 | 0.049 | 0.015 | torch.Size([120]) || stage4.linear2.bias
+ | 0.000 | -0.036 | 0.037 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.weight
+ | 0.000 | -0.005 | 0.005 | 0.002 | torch.Size([120]) || stage4.pa_deform.bias
+ | -0.000 | -0.021 | 0.022 | 0.012 | torch.Size([120, 242, 3, 3]) || stage4.pa_deform.conv_offset.0.weight
+ | -0.001 | -0.021 | 0.021 | 0.012 | torch.Size([120]) || stage4.pa_deform.conv_offset.0.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.2.weight
+ | 0.002 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage4.pa_deform.conv_offset.2.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.4.weight
+ | -0.002 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage4.pa_deform.conv_offset.4.bias
+ | 0.000 | -0.003 | 0.002 | 0.000 | torch.Size([324, 120, 3, 3]) || stage4.pa_deform.conv_offset.6.weight
+ | 0.000 | -0.005 | 0.004 | 0.001 | torch.Size([324]) || stage4.pa_deform.conv_offset.6.bias
+ | 0.000 | -0.172 | 0.177 | 0.022 | torch.Size([360, 360]) || stage4.pa_fuse.fc11.weight
+ | 0.002 | -0.027 | 0.088 | 0.014 | torch.Size([360]) || stage4.pa_fuse.fc11.bias
+ | 0.000 | -0.212 | 0.163 | 0.022 | torch.Size([360, 360]) || stage4.pa_fuse.fc12.weight
+ | 0.000 | -0.066 | 0.081 | 0.014 | torch.Size([360]) || stage4.pa_fuse.fc12.bias
+ | 0.000 | -0.413 | 0.387 | 0.029 | torch.Size([120, 360]) || stage4.pa_fuse.fc2.weight
+ | -0.001 | -0.198 | 0.214 | 0.073 | torch.Size([120]) || stage4.pa_fuse.fc2.bias
+ | 0.979 | 0.896 | 1.076 | 0.053 | torch.Size([30]) || stage5.reshape.1.weight
+ | -0.005 | -0.074 | 0.100 | 0.043 | torch.Size([30]) || stage5.reshape.1.bias
+ | 0.000 | -0.240 | 0.249 | 0.058 | torch.Size([120, 30]) || stage5.reshape.2.weight
+ | -0.002 | -0.286 | 0.229 | 0.080 | torch.Size([120]) || stage5.reshape.2.bias
+ | 1.001 | 0.993 | 1.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.weight
+ | -0.004 | -0.018 | 0.006 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.bias
+ | -0.000 | -0.066 | 0.062 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.0.attn.position_bias
+ | -0.000 | -0.091 | 0.086 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.000 | -0.014 | 0.012 | 0.004 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.166 | 0.172 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.attn.proj.weight
+ | -0.001 | -0.053 | 0.045 | 0.018 | torch.Size([120]) || stage5.residual_group1.blocks.0.attn.proj.bias
+ | -0.000 | -0.090 | 0.081 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_mut.weight
+ | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.999 | 0.987 | 1.001 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.weight
+ | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.bias
+ | 0.000 | -0.094 | 0.079 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc11.weight
+ | 0.000 | -0.022 | 0.012 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.082 | 0.083 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc12.weight
+ | 0.000 | -0.013 | 0.014 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.075 | 0.083 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.073 | 0.078 | 0.021 | torch.Size([120]) || stage5.residual_group1.blocks.0.mlp.fc2.bias
+ | 1.001 | 0.994 | 1.007 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.weight
+ | -0.004 | -0.016 | 0.004 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.bias
+ | 0.000 | -0.065 | 0.063 | 0.020 | torch.Size([675, 6]) || 
stage5.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.1.attn.position_bias + | -0.000 | -0.077 | 0.083 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_self.weight + | 0.000 | -0.022 | 0.017 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.113 | 0.098 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.attn.proj.weight + | 0.000 | -0.058 | 0.045 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.080 | 0.080 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.000 | -0.008 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.999 | 0.982 | 1.001 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.weight + | 0.000 | -0.006 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.bias + | -0.000 | -0.076 | 0.083 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc11.weight + | 0.000 | -0.017 | 0.014 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.080 | 0.086 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc12.weight + | -0.000 | -0.014 | 0.016 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.096 | 0.079 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.mlp.fc2.weight + | 0.001 | -0.051 | 0.039 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.1.mlp.fc2.bias + | 1.002 | 0.998 | 1.009 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.weight + | -0.004 | -0.014 | 0.003 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.bias + | 0.000 | -0.067 | 0.073 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.085 | 0.087 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_self.weight + | 0.000 | -0.015 | 0.014 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.108 | 0.095 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.attn.proj.weight + | -0.001 | -0.043 | 0.039 | 0.013 | torch.Size([120]) || stage5.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.088 | 0.081 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.000 | -0.009 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.999 | 0.978 | 1.001 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.weight + | 0.000 | -0.003 | 0.004 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.076 | 0.081 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc11.weight + | -0.000 | -0.012 | 0.019 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.079 | 0.077 | 0.020 | torch.Size([240, 120]) 
|| stage5.residual_group1.blocks.2.mlp.fc12.weight + | -0.001 | -0.014 | 0.012 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.076 | 0.082 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.mlp.fc2.weight + | -0.000 | -0.047 | 0.043 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.2.mlp.fc2.bias + | 1.002 | 0.978 | 1.015 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.weight + | -0.004 | -0.013 | 0.004 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.bias + | -0.000 | -0.084 | 0.070 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.078 | 0.082 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_self.weight + | -0.000 | -0.014 | 0.014 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.123 | 0.132 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.attn.proj.weight + | 0.001 | -0.028 | 0.044 | 0.015 | torch.Size([120]) || stage5.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.082 | 0.089 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.999 | 0.974 | 1.001 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.weight + | 0.000 | -0.008 | 0.010 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.075 | 0.088 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc11.weight + | 0.000 | -0.014 | 0.019 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.081 | 0.080 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc12.weight + | 0.000 | -0.031 | 0.020 | 0.006 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.081 | 0.106 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.mlp.fc2.weight + | -0.002 | -0.046 | 0.042 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.3.mlp.fc2.bias + | 1.003 | 0.944 | 1.017 | 0.009 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.weight + | -0.005 | -0.015 | 0.004 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.071 | 0.067 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.085 | 0.090 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_self.weight + | -0.000 | -0.021 | 0.013 | 0.004 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.130 | 0.089 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.attn.proj.weight + | -0.001 | -0.036 | 0.024 | 0.011 | torch.Size([120]) || stage5.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.086 | 0.076 | 0.020 | torch.Size([360, 120]) || 
stage5.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.000 | -0.008 | 0.008 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.999 | 0.967 | 1.001 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.weight + | 0.000 | -0.006 | 0.007 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.bias + | 0.000 | -0.080 | 0.085 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc11.weight + | -0.001 | -0.015 | 0.010 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.081 | 0.077 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc12.weight + | -0.000 | -0.020 | 0.018 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.081 | 0.085 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.mlp.fc2.weight + | -0.001 | -0.037 | 0.050 | 0.014 | torch.Size([120]) || stage5.residual_group1.blocks.4.mlp.fc2.bias + | 1.004 | 0.976 | 1.039 | 0.008 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.weight + | -0.005 | -0.015 | 0.005 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.bias + | -0.000 | -0.070 | 0.076 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.099 | 0.097 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_self.weight + | -0.000 | -0.011 | 0.012 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.084 | 0.093 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.attn.proj.weight + | 0.000 | -0.038 | 0.035 | 0.012 | torch.Size([120]) || stage5.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.087 | 0.082 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.000 | -0.008 | 0.010 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.998 | 0.960 | 1.002 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.weight + | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.088 | 0.095 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc11.weight + | -0.000 | -0.014 | 0.027 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.081 | 0.074 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc12.weight + | 0.000 | -0.013 | 0.025 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -0.100 | 0.086 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.mlp.fc2.weight + | 0.000 | -0.022 | 0.030 | 0.011 | torch.Size([120]) || stage5.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.102 | 0.117 | 0.023 | torch.Size([120, 120]) || stage5.linear1.weight + | -0.003 | -0.297 | 0.242 | 0.084 | torch.Size([120]) || stage5.linear1.bias + | 0.999 | 0.971 | 1.008 | 0.005 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.weight + | -0.000 | -0.035 | 0.034 | 0.011 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.bias + | 0.000 | -0.079 | 0.074 | 0.020 | torch.Size([2475, 6]) || 
stage5.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.087 | 0.083 | 0.020 | torch.Size([360, 120]) || stage5.residual_group2.blocks.0.attn.qkv_self.weight + | -0.000 | -0.028 | 0.018 | 0.005 | torch.Size([360]) || stage5.residual_group2.blocks.0.attn.qkv_self.bias + | 0.000 | -0.079 | 0.082 | 0.021 | torch.Size([120, 120]) || stage5.residual_group2.blocks.0.attn.proj.weight + | -0.001 | -0.146 | 0.171 | 0.054 | torch.Size([120]) || stage5.residual_group2.blocks.0.attn.proj.bias + | 0.997 | 0.967 | 1.003 | 0.006 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.weight + | 0.000 | -0.005 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.bias + | -0.000 | -0.073 | 0.089 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc11.weight + | -0.002 | -0.017 | 0.008 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.084 | 0.073 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc12.weight + | 0.000 | -0.013 | 0.011 | 0.003 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.083 | 0.085 | 0.020 | torch.Size([120, 240]) || stage5.residual_group2.blocks.0.mlp.fc2.weight + | 0.000 | -0.103 | 0.140 | 0.037 | torch.Size([120]) || stage5.residual_group2.blocks.0.mlp.fc2.bias + | 0.999 | 0.986 | 1.010 | 0.004 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.weight + | 0.000 | -0.035 | 0.034 | 0.010 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.bias + | 0.000 | -0.087 | 0.074 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.084 | 0.079 | 0.020 | torch.Size([360, 120]) || stage5.residual_group2.blocks.1.attn.qkv_self.weight + | 0.000 | -0.024 | 0.024 | 0.005 | torch.Size([360]) || stage5.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.077 | 0.078 | 0.021 | torch.Size([120, 120]) || stage5.residual_group2.blocks.1.attn.proj.weight + | -0.001 | -0.112 | 0.144 | 0.038 | torch.Size([120]) || stage5.residual_group2.blocks.1.attn.proj.bias + | 0.998 | 0.965 | 1.004 | 0.006 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.weight + | 0.000 | -0.004 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.088 | 0.079 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc11.weight + | -0.001 | -0.012 | 0.015 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.102 | 0.080 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc12.weight + | 0.000 | -0.012 | 0.009 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc12.bias + | 0.000 | -0.075 | 0.078 | 0.020 | torch.Size([120, 240]) || stage5.residual_group2.blocks.1.mlp.fc2.weight + | 0.000 | -0.105 | 0.131 | 0.042 | torch.Size([120]) || stage5.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.220 | 0.209 | 0.035 | torch.Size([120, 120]) || stage5.linear2.weight + | -0.003 | -0.335 | 0.284 | 0.096 | torch.Size([120]) || stage5.linear2.bias + | -0.000 | -0.064 | 0.065 | 0.019 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.weight + | 0.001 | -0.050 | 0.050 | 0.029 | 
torch.Size([120]) || stage5.pa_deform.bias + | 0.000 | -0.119 | 0.106 | 0.013 | torch.Size([120, 242, 3, 3]) || stage5.pa_deform.conv_offset.0.weight + | -0.006 | -0.030 | 0.026 | 0.014 | torch.Size([120]) || stage5.pa_deform.conv_offset.0.bias + | -0.001 | -0.055 | 0.050 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.2.weight + | 0.001 | -0.033 | 0.031 | 0.018 | torch.Size([120]) || stage5.pa_deform.conv_offset.2.bias + | 0.001 | -0.060 | 0.050 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.4.weight + | -0.005 | -0.040 | 0.037 | 0.019 | torch.Size([120]) || stage5.pa_deform.conv_offset.4.bias + | 0.001 | -0.038 | 0.051 | 0.006 | torch.Size([324, 120, 3, 3]) || stage5.pa_deform.conv_offset.6.weight + | 0.000 | -0.048 | 0.050 | 0.017 | torch.Size([324]) || stage5.pa_deform.conv_offset.6.bias + | 0.000 | -0.334 | 0.340 | 0.036 | torch.Size([360, 360]) || stage5.pa_fuse.fc11.weight + | 0.037 | -0.050 | 0.294 | 0.064 | torch.Size([360]) || stage5.pa_fuse.fc11.bias + | -0.000 | -0.343 | 0.349 | 0.036 | torch.Size([360, 360]) || stage5.pa_fuse.fc12.weight + | -0.001 | -0.237 | 0.244 | 0.049 | torch.Size([360]) || stage5.pa_fuse.fc12.bias + | -0.000 | -0.575 | 0.591 | 0.060 | torch.Size([120, 360]) || stage5.pa_fuse.fc2.weight + | -0.001 | -0.404 | 0.344 | 0.122 | torch.Size([120]) || stage5.pa_fuse.fc2.bias + | 1.254 | 1.058 | 1.466 | 0.126 | torch.Size([30]) || stage6.reshape.1.weight + | -0.001 | -0.074 | 0.093 | 0.041 | torch.Size([30]) || stage6.reshape.1.bias + | 0.000 | -0.734 | 0.625 | 0.177 | torch.Size([120, 30]) || stage6.reshape.2.weight + | 0.003 | -0.269 | 0.341 | 0.108 | torch.Size([120]) || stage6.reshape.2.bias + | 0.815 | 0.495 | 1.118 | 0.121 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.weight + | -0.071 | -0.291 | 0.263 | 0.101 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.080 | 0.087 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.136 | 0.134 | 0.026 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_self.weight + | -0.000 | -0.061 | 0.037 | 0.014 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_self.bias + | -0.000 | -0.201 | 0.182 | 0.032 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.attn.proj.weight + | 0.000 | -0.223 | 0.189 | 0.090 | torch.Size([120]) || stage6.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.184 | 0.211 | 0.029 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.000 | -0.049 | 0.069 | 0.011 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.710 | 0.556 | 0.893 | 0.072 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.weight + | -0.003 | -0.172 | 0.193 | 0.070 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.217 | 0.211 | 0.033 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc11.weight + | -0.041 | -0.158 | 0.025 | 0.036 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.209 | 0.178 | 0.031 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc12.weight + | -0.000 | -0.141 | 0.186 | 0.031 | torch.Size([240]) || 
stage6.residual_group1.blocks.0.mlp.fc12.bias + | 0.000 | -0.245 | 0.347 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.mlp.fc2.weight + | 0.005 | -0.161 | 0.188 | 0.079 | torch.Size([120]) || stage6.residual_group1.blocks.0.mlp.fc2.bias + | 0.780 | 0.582 | 0.963 | 0.088 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.weight + | -0.112 | -0.302 | 0.103 | 0.085 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.bias + | 0.000 | -0.101 | 0.072 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.112 | 0.178 | 0.026 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_self.weight + | -0.000 | -0.034 | 0.049 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.223 | 0.242 | 0.033 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.attn.proj.weight + | -0.003 | -0.149 | 0.105 | 0.047 | torch.Size([120]) || stage6.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.199 | 0.173 | 0.031 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.000 | -0.035 | 0.056 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.744 | 0.530 | 0.917 | 0.066 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.weight + | 0.004 | -0.131 | 0.180 | 0.059 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.243 | 0.294 | 0.036 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc11.weight + | -0.039 | -0.217 | 0.045 | 0.037 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.206 | 0.178 | 0.033 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc12.weight + | -0.000 | -0.129 | 0.125 | 0.028 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.236 | 0.276 | 0.040 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.mlp.fc2.weight + | 0.000 | -0.158 | 0.170 | 0.063 | torch.Size([120]) || stage6.residual_group1.blocks.1.mlp.fc2.bias + | 0.829 | 0.586 | 1.007 | 0.078 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.weight + | -0.101 | -0.353 | 0.132 | 0.092 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.bias + | -0.000 | -0.082 | 0.076 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.154 | 0.143 | 0.032 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_self.weight + | 0.000 | -0.041 | 0.038 | 0.012 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.187 | 0.202 | 0.035 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.096 | 0.127 | 0.041 | torch.Size([120]) || stage6.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.203 | 0.185 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.000 | -0.045 | 0.049 | 0.009 | torch.Size([360]) || 
stage6.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.768 | 0.491 | 0.904 | 0.069 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.weight + | 0.001 | -0.146 | 0.159 | 0.062 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.184 | 0.204 | 0.037 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc11.weight + | -0.043 | -0.185 | 0.020 | 0.035 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.188 | 0.270 | 0.035 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc12.weight + | 0.000 | -0.152 | 0.134 | 0.031 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc12.bias + | -0.000 | -0.222 | 0.217 | 0.042 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.mlp.fc2.weight + | 0.002 | -0.141 | 0.144 | 0.058 | torch.Size([120]) || stage6.residual_group1.blocks.2.mlp.fc2.bias + | 0.820 | 0.554 | 0.976 | 0.065 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.weight + | -0.091 | -0.336 | 0.137 | 0.087 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.bias + | 0.000 | -0.124 | 0.222 | 0.023 | torch.Size([675, 6]) || stage6.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.157 | 0.175 | 0.036 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_self.weight + | -0.001 | -0.049 | 0.049 | 0.014 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.238 | 0.236 | 0.036 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.attn.proj.weight + | -0.003 | -0.077 | 0.074 | 0.031 | torch.Size([120]) || stage6.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.212 | 0.265 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.000 | -0.028 | 0.052 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.768 | 0.530 | 0.903 | 0.080 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.weight + | 0.002 | -0.104 | 0.157 | 0.044 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.bias + | -0.000 | -0.197 | 0.220 | 0.039 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc11.weight + | -0.042 | -0.155 | 0.043 | 0.039 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.166 | 0.199 | 0.036 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc12.weight + | 0.001 | -0.102 | 0.138 | 0.040 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.241 | 0.256 | 0.044 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.mlp.fc2.weight + | 0.003 | -0.123 | 0.115 | 0.046 | torch.Size([120]) || stage6.residual_group1.blocks.3.mlp.fc2.bias + | 0.817 | 0.631 | 0.918 | 0.055 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.weight + | -0.082 | -0.295 | 0.141 | 0.074 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.084 | 0.205 | 0.024 | torch.Size([675, 6]) || stage6.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || 
stage6.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.174 | 0.199 | 0.040 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_self.weight + | -0.000 | -0.060 | 0.081 | 0.017 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_self.bias + | -0.000 | -0.194 | 0.191 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.attn.proj.weight + | 0.001 | -0.083 | 0.077 | 0.035 | torch.Size([120]) || stage6.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.218 | 0.243 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.000 | -0.031 | 0.024 | 0.007 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.744 | 0.478 | 0.913 | 0.082 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.weight + | -0.003 | -0.146 | 0.110 | 0.053 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.223 | 0.238 | 0.042 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc11.weight + | -0.046 | -0.200 | 0.071 | 0.051 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.168 | 0.201 | 0.039 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc12.weight + | 0.002 | -0.128 | 0.141 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -0.220 | 0.205 | 0.047 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.mlp.fc2.weight + | 0.001 | -0.086 | 0.094 | 0.034 | torch.Size([120]) || stage6.residual_group1.blocks.4.mlp.fc2.bias + | 0.754 | 0.353 | 0.933 | 0.056 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.weight + | -0.058 | -0.246 | 0.105 | 0.060 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.bias + | -0.000 | -0.113 | 0.536 | 0.030 | torch.Size([675, 6]) || stage6.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.261 | 0.224 | 0.044 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_self.weight + | 0.002 | -0.050 | 0.067 | 0.018 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_self.bias + | 0.000 | -0.234 | 0.256 | 0.038 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.attn.proj.weight + | 0.002 | -0.079 | 0.076 | 0.036 | torch.Size([120]) || stage6.residual_group1.blocks.5.attn.proj.bias + | -0.000 | -0.211 | 0.231 | 0.029 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.000 | -0.033 | 0.030 | 0.008 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.677 | 0.275 | 0.833 | 0.083 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.weight + | 0.001 | -0.224 | 0.306 | 0.102 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.196 | 0.211 | 0.045 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc11.weight + | -0.061 | -0.289 | 0.136 | 0.089 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.271 | 0.312 | 0.048 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc12.weight + | 0.003 | -0.166 | 0.155 | 0.075 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.286 | 0.375 | 0.054 | torch.Size([120, 240]) || 
stage6.residual_group1.blocks.5.mlp.fc2.weight + | 0.005 | -0.054 | 0.137 | 0.031 | torch.Size([120]) || stage6.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.174 | 0.172 | 0.039 | torch.Size([120, 120]) || stage6.linear1.weight + | 0.002 | -0.275 | 0.348 | 0.113 | torch.Size([120]) || stage6.linear1.bias + | 0.704 | 0.402 | 1.002 | 0.132 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.weight + | 0.001 | -0.466 | 0.407 | 0.157 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.bias + | -0.000 | -0.172 | 0.570 | 0.025 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.337 | 0.378 | 0.041 | torch.Size([360, 120]) || stage6.residual_group2.blocks.0.attn.qkv_self.weight + | -0.000 | -0.071 | 0.068 | 0.019 | torch.Size([360]) || stage6.residual_group2.blocks.0.attn.qkv_self.bias + | 0.001 | -0.290 | 0.321 | 0.055 | torch.Size([120, 120]) || stage6.residual_group2.blocks.0.attn.proj.weight + | 0.001 | -0.255 | 0.250 | 0.104 | torch.Size([120]) || stage6.residual_group2.blocks.0.attn.proj.bias + | 0.695 | 0.353 | 0.966 | 0.098 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.weight + | -0.001 | -0.218 | 0.165 | 0.080 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.259 | 0.255 | 0.039 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc11.weight + | -0.044 | -0.256 | 0.042 | 0.047 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.234 | 0.214 | 0.035 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc12.weight + | 0.002 | -0.133 | 0.091 | 0.027 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.333 | 0.296 | 0.042 | torch.Size([120, 240]) || stage6.residual_group2.blocks.0.mlp.fc2.weight + | 0.003 | -0.238 | 0.280 | 0.092 | torch.Size([120]) || stage6.residual_group2.blocks.0.mlp.fc2.bias + | 0.671 | 0.425 | 0.980 | 0.094 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.weight + | 0.001 | -0.261 | 0.305 | 0.119 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.372 | 0.942 | 0.031 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.450 | 0.494 | 0.045 | torch.Size([360, 120]) || stage6.residual_group2.blocks.1.attn.qkv_self.weight + | 0.000 | -0.133 | 0.119 | 0.029 | torch.Size([360]) || stage6.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.239 | 0.288 | 0.046 | torch.Size([120, 120]) || stage6.residual_group2.blocks.1.attn.proj.weight + | -0.001 | -0.187 | 0.157 | 0.064 | torch.Size([120]) || stage6.residual_group2.blocks.1.attn.proj.bias + | 0.687 | 0.160 | 0.907 | 0.128 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.weight + | -0.002 | -0.192 | 0.222 | 0.084 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.257 | 0.426 | 0.042 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc11.weight + | -0.064 | -0.207 | 0.036 | 0.048 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.269 | 0.224 | 0.038 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc12.weight + | -0.000 | -0.126 | 0.129 | 0.030 | 
torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.308 | 0.298 | 0.041 | torch.Size([120, 240]) || stage6.residual_group2.blocks.1.mlp.fc2.weight + | -0.004 | -0.180 | 0.192 | 0.061 | torch.Size([120]) || stage6.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.297 | 0.368 | 0.069 | torch.Size([120, 120]) || stage6.linear2.weight + | 0.001 | -0.431 | 0.480 | 0.189 | torch.Size([120]) || stage6.linear2.bias + | 0.000 | -0.100 | 0.104 | 0.023 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.weight + | 0.001 | -0.018 | 0.029 | 0.010 | torch.Size([120]) || stage6.pa_deform.bias + | 0.000 | -0.105 | 0.111 | 0.015 | torch.Size([120, 242, 3, 3]) || stage6.pa_deform.conv_offset.0.weight + | -0.007 | -0.033 | 0.024 | 0.014 | torch.Size([120]) || stage6.pa_deform.conv_offset.0.bias + | -0.001 | -0.071 | 0.067 | 0.019 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.2.weight + | -0.003 | -0.061 | 0.043 | 0.022 | torch.Size([120]) || stage6.pa_deform.conv_offset.2.bias + | -0.000 | -0.074 | 0.068 | 0.019 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.4.weight + | 0.001 | -0.075 | 0.056 | 0.030 | torch.Size([120]) || stage6.pa_deform.conv_offset.4.bias + | 0.001 | -0.124 | 0.108 | 0.013 | torch.Size([324, 120, 3, 3]) || stage6.pa_deform.conv_offset.6.weight + | -0.001 | -0.113 | 0.076 | 0.021 | torch.Size([324]) || stage6.pa_deform.conv_offset.6.bias + | -0.001 | -0.517 | 0.524 | 0.101 | torch.Size([360, 360]) || stage6.pa_fuse.fc11.weight + | 0.154 | -0.305 | 0.679 | 0.180 | torch.Size([360]) || stage6.pa_fuse.fc11.bias + | 0.000 | -0.680 | 0.728 | 0.103 | torch.Size([360, 360]) || stage6.pa_fuse.fc12.weight + | 0.020 | -0.514 | 0.417 | 0.199 | torch.Size([360]) || stage6.pa_fuse.fc12.bias + | -0.000 | -0.587 | 0.737 | 0.135 | torch.Size([120, 360]) || stage6.pa_fuse.fc2.weight + | 0.015 | -0.437 | 0.490 | 0.230 | torch.Size([120]) || stage6.pa_fuse.fc2.bias + | 1.284 | 1.119 | 1.404 | 0.055 | torch.Size([30]) || stage7.reshape.1.weight + | -0.014 | -0.286 | 0.184 | 0.122 | torch.Size([30]) || stage7.reshape.1.bias + | -0.000 | -0.521 | 0.576 | 0.154 | torch.Size([120, 30]) || stage7.reshape.2.weight + | 0.004 | -0.387 | 0.738 | 0.175 | torch.Size([120]) || stage7.reshape.2.bias + | 0.440 | 0.099 | 0.775 | 0.141 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.weight + | -0.177 | -0.670 | 0.319 | 0.183 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.bias + | -0.055 | -2.159 | 1.979 | 0.240 | torch.Size([675, 6]) || stage7.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.535 | 0.554 | 0.104 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_self.weight + | 0.003 | -0.193 | 0.281 | 0.053 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_self.bias + | -0.001 | -0.397 | 0.395 | 0.075 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.attn.proj.weight + | -0.001 | -0.232 | 0.692 | 0.106 | torch.Size([120]) || stage7.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.899 | 1.073 | 0.091 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.122 | 0.104 | 0.017 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.310 | 0.157 | 0.440 | 0.055 
| torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.weight + | 0.006 | -0.474 | 0.266 | 0.105 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.605 | 0.490 | 0.115 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc11.weight + | -0.101 | -0.310 | 0.126 | 0.070 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.448 | 0.475 | 0.116 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc12.weight + | 0.006 | -0.185 | 0.215 | 0.071 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc12.bias + | 0.001 | -0.465 | 0.512 | 0.122 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.mlp.fc2.weight + | 0.000 | -0.150 | 0.417 | 0.077 | torch.Size([120]) || stage7.residual_group1.blocks.0.mlp.fc2.bias + | 0.577 | 0.165 | 0.829 | 0.105 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.weight + | -0.136 | -0.849 | 0.206 | 0.141 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.bias + | -0.143 | -3.020 | 4.621 | 0.357 | torch.Size([675, 6]) || stage7.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.647 | 0.640 | 0.123 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_self.weight + | -0.002 | -0.356 | 0.382 | 0.064 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.457 | 0.378 | 0.081 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.attn.proj.weight + | 0.000 | -0.250 | 0.707 | 0.108 | torch.Size([120]) || stage7.residual_group1.blocks.1.attn.proj.bias + | -0.001 | -1.055 | 1.091 | 0.096 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.001 | -0.093 | 0.123 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.411 | 0.265 | 0.535 | 0.044 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.weight + | 0.008 | -0.630 | 0.264 | 0.121 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.501 | 0.506 | 0.119 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc11.weight + | -0.087 | -0.341 | 0.140 | 0.073 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.450 | 0.527 | 0.119 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc12.weight + | 0.005 | -0.188 | 0.171 | 0.063 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.554 | 0.546 | 0.121 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.mlp.fc2.weight + | -0.000 | -0.135 | 0.220 | 0.061 | torch.Size([120]) || stage7.residual_group1.blocks.1.mlp.fc2.bias + | 0.655 | 0.134 | 0.896 | 0.130 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.weight + | -0.139 | -0.788 | 0.181 | 0.115 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.bias + | -0.062 | -3.469 | 3.276 | 0.272 | torch.Size([675, 6]) || stage7.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.2.attn.position_bias + | -0.000 | -0.592 | 0.650 | 0.124 | torch.Size([360, 120]) || 
stage7.residual_group1.blocks.2.attn.qkv_self.weight + | -0.000 | -0.308 | 0.218 | 0.062 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.355 | 0.345 | 0.082 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.213 | 0.700 | 0.097 | torch.Size([120]) || stage7.residual_group1.blocks.2.attn.proj.bias + | -0.001 | -1.166 | 0.942 | 0.107 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.000 | -0.106 | 0.093 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.466 | 0.317 | 0.565 | 0.042 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.weight + | 0.014 | -0.657 | 0.280 | 0.118 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.541 | 0.494 | 0.118 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc11.weight + | -0.079 | -0.335 | 0.122 | 0.080 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.513 | 0.493 | 0.123 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc12.weight + | -0.007 | -0.180 | 0.175 | 0.066 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc12.bias + | -0.001 | -0.509 | 0.479 | 0.123 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.mlp.fc2.weight + | 0.004 | -0.093 | 0.293 | 0.054 | torch.Size([120]) || stage7.residual_group1.blocks.2.mlp.fc2.bias + | 0.693 | 0.147 | 0.945 | 0.133 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.weight + | -0.132 | -0.906 | 0.249 | 0.113 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.bias + | -0.108 | -3.576 | 4.241 | 0.344 | torch.Size([675, 6]) || stage7.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.945 | 1.095 | 0.129 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_self.weight + | 0.003 | -0.274 | 0.204 | 0.061 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_self.bias + | -0.001 | -0.379 | 0.351 | 0.081 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.attn.proj.weight + | 0.000 | -0.211 | 0.587 | 0.095 | torch.Size([120]) || stage7.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -1.269 | 1.067 | 0.102 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.001 | -0.091 | 0.117 | 0.021 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.499 | 0.285 | 0.570 | 0.040 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.weight + | 0.012 | -0.567 | 0.273 | 0.104 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.bias + | 0.001 | -0.528 | 0.499 | 0.118 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc11.weight + | -0.084 | -0.349 | 0.141 | 0.078 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.547 | 0.592 | 0.126 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc12.weight + | 0.002 | -0.154 | 0.176 | 0.068 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc12.bias + | 0.001 | -0.520 | 0.480 | 0.125 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.mlp.fc2.weight + | 0.001 | -0.150 | 0.207 | 0.065 | torch.Size([120]) || stage7.residual_group1.blocks.3.mlp.fc2.bias + | 
0.726 | 0.137 | 1.004 | 0.160 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.weight + | -0.122 | -0.907 | 0.180 | 0.103 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.bias + | -0.078 | -3.824 | 4.241 | 0.297 | torch.Size([675, 6]) || stage7.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.4.attn.position_bias + | -0.000 | -1.188 | 0.796 | 0.127 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_self.weight + | 0.002 | -0.248 | 0.207 | 0.056 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_self.bias + | -0.001 | -0.409 | 0.369 | 0.085 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.attn.proj.weight + | 0.002 | -0.224 | 0.322 | 0.094 | torch.Size([120]) || stage7.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -1.744 | 1.273 | 0.110 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.001 | -0.092 | 0.113 | 0.019 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.514 | 0.277 | 0.614 | 0.041 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.weight + | 0.016 | -0.621 | 0.286 | 0.095 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.bias + | 0.001 | -0.517 | 0.453 | 0.116 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc11.weight + | -0.064 | -0.260 | 0.143 | 0.083 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.503 | 0.554 | 0.129 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc12.weight + | -0.004 | -0.232 | 0.193 | 0.075 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc12.bias + | -0.001 | -0.595 | 0.543 | 0.128 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.mlp.fc2.weight + | 0.001 | -0.196 | 0.198 | 0.071 | torch.Size([120]) || stage7.residual_group1.blocks.4.mlp.fc2.bias + | 0.731 | 0.152 | 1.075 | 0.114 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.weight + | -0.076 | -1.003 | 0.176 | 0.107 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.bias + | -0.121 | -3.281 | 4.671 | 0.296 | torch.Size([675, 6]) || stage7.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.640 | 1.083 | 0.122 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_self.weight + | -0.001 | -0.239 | 0.314 | 0.068 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_self.bias + | 0.001 | -0.344 | 0.452 | 0.078 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.attn.proj.weight + | 0.004 | -0.361 | 0.251 | 0.093 | torch.Size([120]) || stage7.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.637 | 0.806 | 0.093 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.000 | -0.088 | 0.091 | 0.017 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.514 | 0.238 | 0.594 | 0.042 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.weight + | 0.017 | -0.650 | 0.162 | 0.089 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.bias + | 0.000 | 
-0.442 | 0.479 | 0.114 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc11.weight + | -0.040 | -0.400 | 0.203 | 0.101 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.541 | 0.514 | 0.130 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc12.weight + | -0.008 | -0.319 | 0.309 | 0.092 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -1.018 | 1.398 | 0.130 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.mlp.fc2.weight + | 0.001 | -1.606 | 0.269 | 0.179 | torch.Size([120]) || stage7.residual_group1.blocks.5.mlp.fc2.bias + | 0.000 | -0.186 | 0.207 | 0.048 | torch.Size([120, 120]) || stage7.linear1.weight + | 0.010 | -0.448 | 0.437 | 0.161 | torch.Size([120]) || stage7.linear1.bias + | 0.703 | 0.381 | 0.856 | 0.084 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.weight + | 0.014 | -0.645 | 0.486 | 0.169 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.bias + | -0.007 | -4.468 | 1.008 | 0.164 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage7.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.625 | 0.834 | 0.120 | torch.Size([360, 120]) || stage7.residual_group2.blocks.0.attn.qkv_self.weight + | -0.009 | -0.737 | 0.632 | 0.135 | torch.Size([360]) || stage7.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.403 | 0.406 | 0.088 | torch.Size([120, 120]) || stage7.residual_group2.blocks.0.attn.proj.weight + | -0.007 | -0.338 | 0.165 | 0.070 | torch.Size([120]) || stage7.residual_group2.blocks.0.attn.proj.bias + | 0.435 | 0.323 | 0.526 | 0.038 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.weight + | 0.005 | -0.678 | 0.379 | 0.117 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.465 | 0.467 | 0.110 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc11.weight + | -0.031 | -0.236 | 0.180 | 0.077 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.490 | 0.520 | 0.121 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc12.weight + | -0.003 | -0.197 | 0.242 | 0.069 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc12.bias + | -0.000 | -0.525 | 0.501 | 0.122 | torch.Size([120, 240]) || stage7.residual_group2.blocks.0.mlp.fc2.weight + | -0.005 | -0.431 | 0.164 | 0.077 | torch.Size([120]) || stage7.residual_group2.blocks.0.mlp.fc2.bias + | 0.703 | 0.306 | 0.866 | 0.079 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.weight + | 0.009 | -0.647 | 0.481 | 0.149 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.bias + | -0.010 | -3.504 | 1.842 | 0.134 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage7.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.639 | 0.590 | 0.122 | torch.Size([360, 120]) || stage7.residual_group2.blocks.1.attn.qkv_self.weight + | -0.001 | -0.613 | 0.609 | 0.148 | torch.Size([360]) || stage7.residual_group2.blocks.1.attn.qkv_self.bias + | 0.001 | -0.316 | 0.325 | 0.085 | torch.Size([120, 120]) || stage7.residual_group2.blocks.1.attn.proj.weight + | -0.004 | -0.350 | 0.145 | 0.069 | torch.Size([120]) || stage7.residual_group2.blocks.1.attn.proj.bias + | 0.452 | 0.309 | 0.558 | 0.037 | torch.Size([120]) || 
stage7.residual_group2.blocks.1.norm2.weight + | 0.003 | -0.661 | 0.246 | 0.091 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.580 | 0.410 | 0.108 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc11.weight + | -0.020 | -0.258 | 0.299 | 0.104 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.561 | 0.126 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc12.weight + | -0.002 | -0.234 | 0.434 | 0.090 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.778 | 0.581 | 0.124 | torch.Size([120, 240]) || stage7.residual_group2.blocks.1.mlp.fc2.weight + | -0.001 | -0.888 | 0.286 | 0.135 | torch.Size([120]) || stage7.residual_group2.blocks.1.mlp.fc2.bias + | -0.001 | -0.348 | 0.237 | 0.060 | torch.Size([120, 120]) || stage7.linear2.weight + | 0.023 | -0.390 | 0.506 | 0.167 | torch.Size([120]) || stage7.linear2.bias + | -0.000 | -0.104 | 0.107 | 0.024 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.weight + | 0.002 | -0.041 | 0.035 | 0.016 | torch.Size([120]) || stage7.pa_deform.bias + | -0.000 | -0.123 | 0.109 | 0.017 | torch.Size([120, 242, 3, 3]) || stage7.pa_deform.conv_offset.0.weight + | -0.002 | -0.034 | 0.032 | 0.015 | torch.Size([120]) || stage7.pa_deform.conv_offset.0.bias + | -0.001 | -0.111 | 0.084 | 0.019 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.2.weight + | -0.008 | -0.073 | 0.081 | 0.034 | torch.Size([120]) || stage7.pa_deform.conv_offset.2.bias + | -0.002 | -0.154 | 0.122 | 0.018 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.4.weight + | 0.014 | -0.041 | 0.068 | 0.026 | torch.Size([120]) || stage7.pa_deform.conv_offset.4.bias + | -0.001 | -0.408 | 0.365 | 0.034 | torch.Size([324, 120, 3, 3]) || stage7.pa_deform.conv_offset.6.weight + | -0.003 | -0.057 | 0.054 | 0.024 | torch.Size([324]) || stage7.pa_deform.conv_offset.6.bias + | 0.000 | -0.697 | 0.606 | 0.123 | torch.Size([360, 360]) || stage7.pa_fuse.fc11.weight + | 0.119 | -0.211 | 0.720 | 0.177 | torch.Size([360]) || stage7.pa_fuse.fc11.bias + | 0.000 | -1.175 | 0.924 | 0.154 | torch.Size([360, 360]) || stage7.pa_fuse.fc12.weight + | -0.000 | -0.581 | 0.580 | 0.190 | torch.Size([360]) || stage7.pa_fuse.fc12.bias + | 0.001 | -0.786 | 0.874 | 0.135 | torch.Size([120, 360]) || stage7.pa_fuse.fc2.weight + | -0.053 | -0.522 | 0.577 | 0.205 | torch.Size([120]) || stage7.pa_fuse.fc2.bias + | 1.225 | 1.000 | 1.516 | 0.095 | torch.Size([120]) || stage8.0.1.weight + | -0.013 | -0.413 | 0.465 | 0.139 | torch.Size([120]) || stage8.0.1.bias + | 0.000 | -2.505 | 0.627 | 0.136 | torch.Size([180, 120]) || stage8.0.2.weight + | 0.005 | -0.397 | 0.377 | 0.107 | torch.Size([180]) || stage8.0.2.bias + | 0.456 | 0.123 | 0.760 | 0.129 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.weight + | -0.022 | -0.343 | 0.875 | 0.099 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.bias + | -0.014 | -1.907 | 2.592 | 0.130 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.632 | 0.628 | 0.099 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.0.attn.qkv_self.weight + | 0.006 | -0.567 | 0.668 | 0.148 | torch.Size([540]) || stage8.1.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.477 | 0.447 | 0.094 | torch.Size([180, 180]) || 
stage8.1.residual_group.blocks.0.attn.proj.weight + | -0.010 | -0.460 | 0.225 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.0.attn.proj.bias + | 0.429 | 0.119 | 0.634 | 0.090 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.weight + | -0.007 | -0.338 | 0.803 | 0.086 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.bias + | -0.006 | -0.572 | 0.539 | 0.119 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc11.weight + | -0.060 | -0.260 | 0.185 | 0.060 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.461 | 0.548 | 0.113 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc12.weight + | 0.000 | -0.163 | 0.183 | 0.050 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.757 | 0.581 | 0.118 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.0.mlp.fc2.weight + | -0.003 | -0.191 | 0.121 | 0.057 | torch.Size([180]) || stage8.1.residual_group.blocks.0.mlp.fc2.bias + | 0.557 | 0.086 | 0.800 | 0.112 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.weight + | -0.029 | -0.230 | 0.878 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.bias + | -0.016 | -2.004 | 1.711 | 0.154 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.690 | 0.575 | 0.109 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.1.attn.qkv_self.weight + | 0.011 | -0.641 | 0.609 | 0.135 | torch.Size([540]) || stage8.1.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.466 | 0.401 | 0.094 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.1.attn.proj.weight + | -0.008 | -0.344 | 0.181 | 0.080 | torch.Size([180]) || stage8.1.residual_group.blocks.1.attn.proj.bias + | 0.503 | 0.226 | 0.742 | 0.093 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.weight + | -0.009 | -0.404 | 0.818 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.bias + | -0.007 | -0.595 | 0.532 | 0.121 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc11.weight + | -0.068 | -0.261 | 0.071 | 0.053 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.573 | 0.116 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc12.weight + | 0.002 | -0.129 | 0.197 | 0.046 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.556 | 0.582 | 0.118 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.1.mlp.fc2.weight + | -0.003 | -0.170 | 0.145 | 0.052 | torch.Size([180]) || stage8.1.residual_group.blocks.1.mlp.fc2.bias + | 0.699 | 0.202 | 0.912 | 0.109 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.weight + | -0.033 | -0.253 | 0.924 | 0.091 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.bias + | -0.030 | -2.510 | 2.088 | 0.194 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.637 | 0.801 | 0.116 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.2.attn.qkv_self.weight + | 0.006 | -0.512 | 0.520 | 0.110 | torch.Size([540]) || stage8.1.residual_group.blocks.2.attn.qkv_self.bias + | 0.000 | -0.381 | 0.337 | 0.090 | torch.Size([180, 180]) || 
stage8.1.residual_group.blocks.2.attn.proj.weight + | -0.011 | -0.238 | 0.234 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.2.attn.proj.bias + | 0.594 | 0.150 | 0.810 | 0.108 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.weight + | -0.010 | -0.483 | 0.726 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.bias + | -0.006 | -0.567 | 0.499 | 0.125 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc11.weight + | -0.077 | -0.360 | 0.050 | 0.056 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.536 | 0.673 | 0.119 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc12.weight + | 0.001 | -0.142 | 0.186 | 0.043 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.536 | 0.524 | 0.119 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.2.mlp.fc2.weight + | -0.006 | -0.147 | 0.133 | 0.051 | torch.Size([180]) || stage8.1.residual_group.blocks.2.mlp.fc2.bias + | 0.683 | 0.141 | 0.908 | 0.105 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.weight + | -0.033 | -0.199 | 0.878 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.bias + | -0.039 | -1.527 | 3.891 | 0.199 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.682 | 0.693 | 0.120 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.3.attn.qkv_self.weight + | 0.007 | -0.543 | 0.513 | 0.138 | torch.Size([540]) || stage8.1.residual_group.blocks.3.attn.qkv_self.bias + | -0.001 | -0.390 | 0.476 | 0.089 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.3.attn.proj.weight + | -0.007 | -0.176 | 0.150 | 0.062 | torch.Size([180]) || stage8.1.residual_group.blocks.3.attn.proj.bias + | 0.640 | 0.094 | 0.853 | 0.120 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.weight + | -0.009 | -0.372 | 0.683 | 0.084 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.bias + | -0.006 | -0.628 | 0.521 | 0.126 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc11.weight + | -0.089 | -0.367 | 0.047 | 0.054 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.629 | 0.562 | 0.121 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc12.weight + | -0.001 | -0.186 | 0.128 | 0.042 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.485 | 0.499 | 0.118 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.3.mlp.fc2.weight + | -0.007 | -0.138 | 0.209 | 0.050 | torch.Size([180]) || stage8.1.residual_group.blocks.3.mlp.fc2.bias + | 0.000 | -0.294 | 0.577 | 0.071 | torch.Size([180, 180]) || stage8.1.linear.weight + | 0.004 | -0.349 | 0.235 | 0.072 | torch.Size([180]) || stage8.1.linear.bias + | 0.708 | 0.242 | 1.026 | 0.136 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.weight + | -0.032 | -0.212 | 0.830 | 0.100 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.bias + | -0.039 | -1.954 | 2.394 | 0.212 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.922 | 0.646 | 0.116 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | 
-0.429 | 0.524 | 0.101 | torch.Size([540]) || stage8.2.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.467 | 0.453 | 0.109 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.0.attn.proj.weight + | -0.005 | -0.339 | 0.264 | 0.095 | torch.Size([180]) || stage8.2.residual_group.blocks.0.attn.proj.bias + | 0.587 | 0.255 | 0.837 | 0.086 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.weight + | -0.011 | -0.285 | 0.721 | 0.083 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.bias + | -0.006 | -0.586 | 0.534 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc11.weight + | -0.075 | -0.225 | 0.066 | 0.047 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.493 | 0.532 | 0.123 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc12.weight + | 0.003 | -0.189 | 0.178 | 0.047 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.551 | 0.543 | 0.124 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.0.mlp.fc2.weight + | -0.010 | -0.154 | 0.142 | 0.054 | torch.Size([180]) || stage8.2.residual_group.blocks.0.mlp.fc2.bias + | 0.773 | 0.210 | 1.004 | 0.113 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.weight + | -0.035 | -0.176 | 0.873 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.bias + | -0.027 | -2.407 | 1.736 | 0.214 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.817 | 0.977 | 0.123 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.1.attn.qkv_self.weight + | 0.001 | -0.659 | 0.461 | 0.115 | torch.Size([540]) || stage8.2.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.484 | 0.453 | 0.109 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.1.attn.proj.weight + | -0.014 | -0.315 | 0.252 | 0.091 | torch.Size([180]) || stage8.2.residual_group.blocks.1.attn.proj.bias + | 0.641 | 0.337 | 0.810 | 0.081 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.weight + | -0.011 | -0.177 | 0.806 | 0.083 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.bias + | -0.006 | -0.569 | 0.598 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc11.weight + | -0.079 | -0.323 | 0.071 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.512 | 0.577 | 0.126 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc12.weight + | -0.003 | -0.142 | 0.161 | 0.050 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.529 | 0.572 | 0.125 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.1.mlp.fc2.weight + | -0.010 | -0.178 | 0.159 | 0.066 | torch.Size([180]) || stage8.2.residual_group.blocks.1.mlp.fc2.bias + | 0.857 | 0.199 | 1.153 | 0.112 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.weight + | -0.039 | -0.189 | 0.943 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.bias + | -0.042 | -1.962 | 2.773 | 0.246 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.783 | 0.655 | 0.123 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.2.attn.qkv_self.weight + | 0.004 | 
-0.338 | 0.533 | 0.099 | torch.Size([540]) || stage8.2.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.497 | 0.461 | 0.107 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.2.attn.proj.weight + | -0.008 | -0.288 | 0.183 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.2.attn.proj.bias + | 0.681 | 0.327 | 0.878 | 0.085 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.weight + | -0.012 | -0.178 | 0.773 | 0.084 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.bias + | -0.006 | -0.789 | 0.546 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc11.weight + | -0.081 | -0.249 | 0.036 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.526 | 0.555 | 0.128 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc12.weight + | 0.000 | -0.133 | 0.191 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.572 | 0.529 | 0.126 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.164 | 0.147 | 0.065 | torch.Size([180]) || stage8.2.residual_group.blocks.2.mlp.fc2.bias + | 0.877 | 0.198 | 1.043 | 0.094 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.weight + | -0.038 | -0.210 | 0.916 | 0.091 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.bias + | -0.094 | -2.974 | 4.987 | 0.299 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.964 | 1.011 | 0.126 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.3.attn.qkv_self.weight + | -0.002 | -0.404 | 0.429 | 0.101 | torch.Size([540]) || stage8.2.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.501 | 0.489 | 0.110 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.3.attn.proj.weight + | -0.021 | -0.305 | 0.208 | 0.097 | torch.Size([180]) || stage8.2.residual_group.blocks.3.attn.proj.bias + | 0.697 | 0.295 | 0.894 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.weight + | -0.015 | -0.241 | 0.712 | 0.086 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.bias + | -0.005 | -0.562 | 0.573 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc11.weight + | -0.085 | -0.302 | 0.080 | 0.060 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.734 | 0.573 | 0.130 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc12.weight + | 0.001 | -0.150 | 0.161 | 0.054 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.671 | 0.623 | 0.127 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.3.mlp.fc2.weight + | -0.023 | -0.252 | 0.317 | 0.081 | torch.Size([180]) || stage8.2.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.278 | 0.345 | 0.064 | torch.Size([180, 180]) || stage8.2.linear.weight + | 0.004 | -0.315 | 0.148 | 0.064 | torch.Size([180]) || stage8.2.linear.bias + | 0.850 | 0.326 | 1.087 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.weight + | -0.031 | -0.334 | 0.779 | 0.106 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.bias + | -0.012 | -2.917 | 1.476 | 0.175 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || 
stage8.3.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.603 | 0.666 | 0.124 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.374 | 0.381 | 0.086 | torch.Size([540]) || stage8.3.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.577 | 0.605 | 0.119 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.0.attn.proj.weight + | -0.008 | -0.394 | 0.499 | 0.134 | torch.Size([180]) || stage8.3.residual_group.blocks.0.attn.proj.bias + | 0.636 | 0.321 | 0.790 | 0.073 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.weight + | -0.013 | -0.294 | 0.774 | 0.090 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.bias + | -0.004 | -0.540 | 0.539 | 0.123 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc11.weight + | -0.065 | -0.212 | 0.047 | 0.051 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.608 | 0.603 | 0.130 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc12.weight + | -0.002 | -0.177 | 0.155 | 0.051 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.573 | 0.630 | 0.129 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.0.mlp.fc2.weight + | -0.005 | -0.189 | 0.178 | 0.071 | torch.Size([180]) || stage8.3.residual_group.blocks.0.mlp.fc2.bias + | 0.899 | 0.275 | 1.048 | 0.099 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.weight + | -0.031 | -0.223 | 0.771 | 0.088 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.bias + | -0.003 | -3.151 | 1.718 | 0.202 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.1.attn.relative_position_index + | -0.000 | -0.732 | 0.868 | 0.127 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.1.attn.qkv_self.weight + | 0.002 | -0.412 | 0.350 | 0.093 | torch.Size([540]) || stage8.3.residual_group.blocks.1.attn.qkv_self.bias + | 0.001 | -0.466 | 0.487 | 0.114 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.1.attn.proj.weight + | -0.006 | -0.388 | 0.400 | 0.129 | torch.Size([180]) || stage8.3.residual_group.blocks.1.attn.proj.bias + | 0.711 | 0.381 | 0.864 | 0.082 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.weight + | -0.009 | -0.240 | 0.692 | 0.090 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.bias + | -0.005 | -0.657 | 0.639 | 0.126 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc11.weight + | -0.077 | -0.263 | 0.047 | 0.057 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.673 | 0.605 | 0.134 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc12.weight + | 0.002 | -0.158 | 0.155 | 0.046 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc12.bias + | -0.000 | -0.582 | 0.585 | 0.131 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.1.mlp.fc2.weight + | -0.009 | -0.253 | 0.178 | 0.070 | torch.Size([180]) || stage8.3.residual_group.blocks.1.mlp.fc2.bias + | 0.941 | 0.262 | 1.154 | 0.094 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.weight + | -0.032 | -0.162 | 0.906 | 0.084 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.bias + | -0.005 | -3.421 | 1.350 | 0.205 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || 
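
The recurring relative_position_bias_table of shape [2475, 6] and relative_position_index of shape [384, 384] follow from Swin-style relative position bias over the (6, 8, 8) attention window with 6 heads: (2*6-1)*(2*8-1)*(2*8-1) = 2475 table rows and 6*8*8 = 384 tokens per window, which also explains the index statistics (min 0, max 2474, mean exactly 1237). From stage8.5 onward the tables shrink to [225, 6] with 64x64 indices, consistent with a spatial-only (1, 8, 8) window: 1*15*15 = 225 and 8*8 = 64. A sketch of the standard 3-D index construction, assuming VRT follows the usual Swin recipe:

    import torch

    def relative_position_index_3d(d, h, w):
        # Swin-style relative-position index for a (d, h, w) window
        # (torch >= 1.10 for the indexing= keyword).
        coords = torch.stack(torch.meshgrid(
            torch.arange(d), torch.arange(h), torch.arange(w), indexing='ij'))
        coords = torch.flatten(coords, 1)               # 3, d*h*w
        rel = coords[:, :, None] - coords[:, None, :]   # 3, N, N
        rel = rel.permute(1, 2, 0).contiguous()         # N, N, 3
        rel[:, :, 0] += d - 1                           # shift each axis to >= 0
        rel[:, :, 1] += h - 1
        rel[:, :, 2] += w - 1
        rel[:, :, 0] *= (2 * h - 1) * (2 * w - 1)
        rel[:, :, 1] *= 2 * w - 1
        return rel.sum(-1)                              # N, N

    idx = relative_position_index_3d(6, 8, 8)           # 384 x 384
    assert idx.max().item() == 2474
    assert idx.double().mean().item() == 1237.0         # matches the log
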
stage8.3.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.777 | 0.735 | 0.130 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.2.attn.qkv_self.weight + | 0.000 | -0.355 | 0.421 | 0.092 | torch.Size([540]) || stage8.3.residual_group.blocks.2.attn.qkv_self.bias + | 0.000 | -0.479 | 0.475 | 0.115 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.2.attn.proj.weight + | -0.013 | -0.292 | 0.345 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.2.attn.proj.bias + | 0.743 | 0.242 | 0.919 | 0.093 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.weight + | -0.011 | -0.214 | 0.691 | 0.094 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.bias + | -0.005 | -0.633 | 0.498 | 0.127 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc11.weight + | -0.082 | -0.346 | 0.087 | 0.062 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.591 | 0.670 | 0.134 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc12.weight + | 0.001 | -0.190 | 0.151 | 0.056 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.560 | 0.637 | 0.132 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.2.mlp.fc2.weight + | -0.009 | -0.226 | 0.250 | 0.085 | torch.Size([180]) || stage8.3.residual_group.blocks.2.mlp.fc2.bias + | 0.950 | 0.250 | 1.103 | 0.086 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.weight + | -0.035 | -0.196 | 0.925 | 0.088 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.bias + | -0.026 | -3.591 | 5.653 | 0.236 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.753 | 0.637 | 0.128 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.3.attn.qkv_self.weight + | 0.000 | -0.333 | 0.432 | 0.081 | torch.Size([540]) || stage8.3.residual_group.blocks.3.attn.qkv_self.bias + | 0.001 | -0.591 | 0.591 | 0.118 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.3.attn.proj.weight + | -0.014 | -0.348 | 0.267 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.3.attn.proj.bias + | 0.735 | 0.254 | 0.893 | 0.082 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.weight + | -0.011 | -0.241 | 0.659 | 0.093 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.bias + | -0.005 | -0.628 | 0.667 | 0.125 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc11.weight + | -0.076 | -0.411 | 0.113 | 0.072 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.662 | 0.578 | 0.135 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc12.weight + | -0.004 | -0.208 | 0.169 | 0.054 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.602 | 0.588 | 0.131 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.3.mlp.fc2.weight + | -0.011 | -0.218 | 0.232 | 0.096 | torch.Size([180]) || stage8.3.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.343 | 0.316 | 0.065 | torch.Size([180, 180]) || stage8.3.linear.weight + | 0.010 | -0.297 | 0.187 | 0.061 | torch.Size([180]) || stage8.3.linear.bias + | 1.012 | 0.330 | 1.282 | 0.149 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.weight + | -0.030 | -0.347 | 0.800 | 0.134 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.bias + | -0.013 | -2.816 | 3.792 | 0.236 | 
torch.Size([2475, 6]) || stage8.4.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.807 | 0.825 | 0.131 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.0.attn.qkv_self.weight + | -0.003 | -0.429 | 0.319 | 0.083 | torch.Size([540]) || stage8.4.residual_group.blocks.0.attn.qkv_self.bias + | 0.001 | -0.553 | 0.569 | 0.136 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.0.attn.proj.weight + | -0.019 | -0.443 | 0.441 | 0.139 | torch.Size([180]) || stage8.4.residual_group.blocks.0.attn.proj.bias + | 0.638 | 0.420 | 0.797 | 0.063 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.weight + | -0.018 | -0.222 | 0.886 | 0.107 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.bias + | -0.002 | -0.576 | 0.510 | 0.117 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc11.weight + | -0.018 | -0.277 | 0.123 | 0.068 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.687 | 0.625 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc12.weight + | -0.007 | -0.264 | 0.267 | 0.076 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc12.bias + | 0.001 | -0.639 | 0.705 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.0.mlp.fc2.weight + | -0.012 | -0.255 | 0.274 | 0.095 | torch.Size([180]) || stage8.4.residual_group.blocks.0.mlp.fc2.bias + | 1.092 | 0.475 | 1.341 | 0.115 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.weight + | -0.030 | -0.294 | 0.686 | 0.113 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.bias + | 0.018 | -3.165 | 0.990 | 0.213 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.695 | 0.699 | 0.133 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.1.attn.qkv_self.weight + | 0.002 | -0.319 | 0.286 | 0.075 | torch.Size([540]) || stage8.4.residual_group.blocks.1.attn.qkv_self.bias + | -0.001 | -0.542 | 0.519 | 0.133 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.1.attn.proj.weight + | -0.017 | -0.439 | 0.451 | 0.152 | torch.Size([180]) || stage8.4.residual_group.blocks.1.attn.proj.bias + | 0.664 | 0.366 | 0.835 | 0.074 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.weight + | -0.015 | -0.217 | 0.985 | 0.103 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.bias + | -0.002 | -0.641 | 0.563 | 0.117 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc11.weight + | -0.022 | -0.381 | 0.161 | 0.078 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.571 | 0.642 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc12.weight + | 0.003 | -0.279 | 0.311 | 0.087 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.738 | 0.633 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.1.mlp.fc2.weight + | -0.007 | -0.254 | 0.261 | 0.084 | torch.Size([180]) || stage8.4.residual_group.blocks.1.mlp.fc2.bias + | 1.125 | 0.525 | 1.405 | 0.117 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.weight + | -0.033 | -0.186 | 0.627 | 0.082 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.bias + | 0.028 | -3.477 | 0.957 | 0.217 | 
torch.Size([2475, 6]) || stage8.4.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.663 | 0.658 | 0.130 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.2.attn.qkv_self.weight + | -0.007 | -0.357 | 0.255 | 0.064 | torch.Size([540]) || stage8.4.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.596 | 0.578 | 0.137 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.2.attn.proj.weight + | -0.018 | -0.506 | 0.389 | 0.159 | torch.Size([180]) || stage8.4.residual_group.blocks.2.attn.proj.bias + | 0.694 | 0.319 | 0.865 | 0.084 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.weight + | -0.018 | -0.150 | 0.975 | 0.087 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.bias + | -0.002 | -0.619 | 0.565 | 0.116 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc11.weight + | -0.025 | -0.345 | 0.208 | 0.086 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.624 | 0.607 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc12.weight + | -0.003 | -0.388 | 0.290 | 0.075 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.927 | 0.675 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.325 | 0.240 | 0.096 | torch.Size([180]) || stage8.4.residual_group.blocks.2.mlp.fc2.bias + | 1.108 | 0.535 | 1.297 | 0.094 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.weight + | -0.035 | -0.213 | 0.546 | 0.064 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.bias + | 0.020 | -3.042 | 1.420 | 0.192 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.697 | 0.700 | 0.128 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.3.attn.qkv_self.weight + | -0.000 | -0.220 | 0.311 | 0.065 | torch.Size([540]) || stage8.4.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.652 | 0.592 | 0.138 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.3.attn.proj.weight + | -0.019 | -0.535 | 0.426 | 0.154 | torch.Size([180]) || stage8.4.residual_group.blocks.3.attn.proj.bias + | 0.685 | 0.225 | 0.893 | 0.082 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.weight + | -0.023 | -0.211 | 0.938 | 0.093 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.bias + | -0.001 | -0.501 | 0.564 | 0.113 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc11.weight + | -0.014 | -0.339 | 0.237 | 0.092 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.560 | 0.626 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc12.weight + | 0.000 | -0.231 | 0.239 | 0.075 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.544 | 0.657 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.3.mlp.fc2.weight + | -0.007 | -0.271 | 0.274 | 0.093 | torch.Size([180]) || stage8.4.residual_group.blocks.3.mlp.fc2.bias + | -0.001 | -0.473 | 0.481 | 0.069 | torch.Size([180, 180]) || stage8.4.linear.weight + | 0.029 | -0.333 | 0.194 | 0.076 | torch.Size([180]) || stage8.4.linear.bias + | 1.025 | 0.297 | 1.336 | 0.162 | torch.Size([180]) || 
stage8.5.residual_group.blocks.0.norm1.weight + | -0.034 | -0.429 | 0.872 | 0.141 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.bias + | -0.574 | -4.515 | 3.381 | 0.800 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.771 | 0.886 | 0.125 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.0.attn.qkv_self.weight + | 0.000 | -0.356 | 0.521 | 0.085 | torch.Size([540]) || stage8.5.residual_group.blocks.0.attn.qkv_self.bias + | -0.001 | -0.632 | 0.656 | 0.147 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.0.attn.proj.weight + | -0.029 | -0.329 | 0.697 | 0.127 | torch.Size([180]) || stage8.5.residual_group.blocks.0.attn.proj.bias + | 0.777 | 0.446 | 0.952 | 0.069 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.weight + | -0.022 | -0.335 | 0.920 | 0.121 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.bias + | -0.002 | -0.520 | 0.598 | 0.117 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc11.weight + | -0.013 | -0.456 | 0.200 | 0.075 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.677 | 0.642 | 0.137 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc12.weight + | 0.005 | -0.272 | 0.233 | 0.083 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.762 | 0.598 | 0.136 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.0.mlp.fc2.weight + | -0.025 | -0.244 | 0.583 | 0.111 | torch.Size([180]) || stage8.5.residual_group.blocks.0.mlp.fc2.bias + | 1.021 | 0.261 | 1.261 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.weight + | -0.033 | -0.358 | 0.867 | 0.120 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.bias + | -0.550 | -3.274 | 4.406 | 0.670 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.819 | 0.986 | 0.122 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.1.attn.qkv_self.weight + | 0.005 | -0.510 | 0.446 | 0.084 | torch.Size([540]) || stage8.5.residual_group.blocks.1.attn.qkv_self.bias + | -0.003 | -0.739 | 0.682 | 0.151 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.1.attn.proj.weight + | -0.032 | -0.318 | 0.607 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.attn.proj.bias + | 0.823 | 0.420 | 0.950 | 0.070 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.weight + | -0.021 | -0.274 | 0.882 | 0.111 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.bias + | -0.002 | -0.496 | 0.532 | 0.117 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc11.weight + | -0.028 | -0.260 | 0.194 | 0.080 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.620 | 0.586 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc12.weight + | 0.004 | -0.284 | 0.423 | 0.083 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.774 | 0.614 | 0.137 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.1.mlp.fc2.weight + | -0.028 | -0.371 | 0.561 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.mlp.fc2.bias + | 1.096 | 0.377 | 1.321 | 0.110 | torch.Size([180]) || 
stage8.5.residual_group.blocks.2.norm1.weight + | -0.033 | -0.244 | 0.755 | 0.100 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.bias + | -0.441 | -3.439 | 5.870 | 0.668 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.710 | 0.679 | 0.123 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.2.attn.qkv_self.weight + | 0.003 | -0.277 | 0.283 | 0.068 | torch.Size([540]) || stage8.5.residual_group.blocks.2.attn.qkv_self.bias + | 0.001 | -0.824 | 0.684 | 0.150 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.2.attn.proj.weight + | -0.033 | -0.390 | 0.545 | 0.155 | torch.Size([180]) || stage8.5.residual_group.blocks.2.attn.proj.bias + | 0.843 | 0.390 | 0.984 | 0.076 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.weight + | -0.022 | -0.211 | 0.854 | 0.090 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.bias + | -0.002 | -0.522 | 0.503 | 0.116 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc11.weight + | -0.024 | -0.243 | 0.219 | 0.091 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc11.bias + | -0.001 | -0.638 | 0.617 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc12.weight + | -0.004 | -0.268 | 0.380 | 0.078 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.713 | 0.769 | 0.138 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.2.mlp.fc2.weight + | -0.034 | -0.372 | 0.592 | 0.151 | torch.Size([180]) || stage8.5.residual_group.blocks.2.mlp.fc2.bias + | 1.027 | 0.318 | 1.206 | 0.094 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.weight + | -0.033 | -0.187 | 0.768 | 0.088 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.bias + | -0.347 | -2.664 | 2.684 | 0.528 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.677 | 0.676 | 0.127 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.3.attn.qkv_self.weight + | 0.002 | -0.410 | 0.354 | 0.080 | torch.Size([540]) || stage8.5.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.630 | 0.725 | 0.145 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.3.attn.proj.weight + | -0.041 | -0.385 | 0.660 | 0.163 | torch.Size([180]) || stage8.5.residual_group.blocks.3.attn.proj.bias + | 0.849 | 0.390 | 0.985 | 0.070 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.weight + | -0.023 | -0.163 | 0.810 | 0.084 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.bias + | -0.002 | -0.547 | 0.536 | 0.115 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc11.weight + | -0.012 | -0.366 | 0.252 | 0.106 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.669 | 0.597 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc12.weight + | -0.002 | -0.216 | 0.202 | 0.074 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.700 | 0.674 | 0.139 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.3.mlp.fc2.weight + | -0.032 | -0.376 | 0.666 | 0.134 | torch.Size([180]) || stage8.5.residual_group.blocks.3.mlp.fc2.bias + | -0.001 | -0.299 | 0.469 | 0.069 | torch.Size([180, 180]) || 
stage8.5.linear.weight + | 0.081 | -0.562 | 0.263 | 0.109 | torch.Size([180]) || stage8.5.linear.bias + | 1.111 | 0.208 | 1.434 | 0.192 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.weight + | -0.048 | -0.547 | 0.851 | 0.175 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.bias + | -0.252 | -2.157 | 6.293 | 0.490 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.664 | 0.631 | 0.123 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.0.attn.qkv_self.weight + | 0.007 | -0.293 | 0.366 | 0.078 | torch.Size([540]) || stage8.6.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.701 | 0.726 | 0.154 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.0.attn.proj.weight + | 0.030 | -0.318 | 0.331 | 0.109 | torch.Size([180]) || stage8.6.residual_group.blocks.0.attn.proj.bias + | 0.959 | 0.475 | 1.322 | 0.088 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.weight + | -0.039 | -0.421 | 0.873 | 0.151 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.bias + | -0.002 | -0.550 | 0.783 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc11.weight + | 0.002 | -0.269 | 0.152 | 0.069 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.914 | 0.839 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc12.weight + | 0.001 | -0.340 | 0.304 | 0.075 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.592 | 0.713 | 0.140 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.0.mlp.fc2.weight + | 0.002 | -0.535 | 0.384 | 0.177 | torch.Size([180]) || stage8.6.residual_group.blocks.0.mlp.fc2.bias + | 1.123 | 0.183 | 1.352 | 0.165 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.weight + | -0.047 | -0.513 | 0.903 | 0.168 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.bias + | -0.234 | -1.968 | 6.366 | 0.448 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.751 | 0.759 | 0.121 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.1.attn.qkv_self.weight + | -0.001 | -0.300 | 0.214 | 0.061 | torch.Size([540]) || stage8.6.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.657 | 0.699 | 0.148 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.1.attn.proj.weight + | 0.031 | -0.321 | 0.293 | 0.115 | torch.Size([180]) || stage8.6.residual_group.blocks.1.attn.proj.bias + | 0.986 | 0.416 | 1.360 | 0.096 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.weight + | -0.038 | -0.393 | 0.807 | 0.146 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.bias + | -0.001 | -0.589 | 0.620 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc11.weight + | 0.005 | -0.316 | 0.229 | 0.071 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.738 | 0.766 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc12.weight + | 0.001 | -0.252 | 0.302 | 0.072 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.674 | 0.629 | 0.140 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.1.mlp.fc2.weight + | -0.001 | -0.475 | 
0.441 | 0.175 | torch.Size([180]) || stage8.6.residual_group.blocks.1.mlp.fc2.bias + | 1.097 | 0.342 | 1.294 | 0.134 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.weight + | -0.054 | -0.639 | 0.904 | 0.186 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.bias + | -0.135 | -3.252 | 1.238 | 0.360 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.672 | 0.663 | 0.128 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.2.attn.qkv_self.weight + | 0.007 | -0.170 | 0.228 | 0.046 | torch.Size([540]) || stage8.6.residual_group.blocks.2.attn.qkv_self.bias + | -0.001 | -0.660 | 0.651 | 0.147 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.2.attn.proj.weight + | 0.031 | -0.360 | 0.322 | 0.126 | torch.Size([180]) || stage8.6.residual_group.blocks.2.attn.proj.bias + | 1.004 | 0.360 | 1.381 | 0.099 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.weight + | -0.042 | -0.447 | 0.808 | 0.157 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.bias + | -0.000 | -0.600 | 0.603 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc11.weight + | 0.022 | -0.447 | 0.249 | 0.086 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.666 | 0.708 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc12.weight + | -0.002 | -0.326 | 0.272 | 0.075 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc12.bias + | -0.001 | -0.653 | 0.719 | 0.142 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.488 | 0.321 | 0.153 | torch.Size([180]) || stage8.6.residual_group.blocks.2.mlp.fc2.bias + | 1.095 | 0.272 | 1.302 | 0.123 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.weight + | -0.052 | -0.557 | 1.069 | 0.192 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.bias + | -0.196 | -2.349 | 1.401 | 0.360 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.741 | 0.657 | 0.124 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.3.attn.qkv_self.weight + | 0.001 | -0.186 | 0.141 | 0.040 | torch.Size([540]) || stage8.6.residual_group.blocks.3.attn.qkv_self.bias + | -0.001 | -0.669 | 0.671 | 0.139 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.3.attn.proj.weight + | -0.004 | -0.323 | 0.300 | 0.124 | torch.Size([180]) || stage8.6.residual_group.blocks.3.attn.proj.bias + | 0.999 | 0.383 | 1.380 | 0.103 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.weight + | -0.044 | -0.392 | 0.694 | 0.163 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.bias + | 0.000 | -0.577 | 0.857 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc11.weight + | 0.041 | -0.394 | 0.238 | 0.087 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.924 | 0.828 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc12.weight + | -0.003 | -0.214 | 0.407 | 0.071 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.827 | 0.755 | 0.141 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.3.mlp.fc2.weight + | 0.022 | -0.296 | 0.262 | 0.107 | 
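
Every block's MLP in this table comes as the fc11/fc12/fc2 triple ([360, 180], [360, 180], [180, 360] in the 180-dim stages): two parallel expansions feeding one projection, i.e. a gated feed-forward, which the structure dump further down prints as Mlp_GEGLU. A sketch matching those shapes, assuming the usual GEGLU gating fc2(GELU(fc11(x)) * fc12(x)):

    import torch
    import torch.nn as nn

    class MlpGEGLU(nn.Module):
        # Gated feed-forward consistent with the printed fc11/fc12/fc2 shapes;
        # the exact gating order is the conventional GEGLU form, an assumption.
        def __init__(self, dim=180, hidden=360, drop=0.0):
            super().__init__()
            self.fc11 = nn.Linear(dim, hidden)   # expansion branch
            self.fc12 = nn.Linear(dim, hidden)   # gating branch
            self.act = nn.GELU()
            self.fc2 = nn.Linear(hidden, dim)    # projection back to dim
            self.drop = nn.Dropout(drop)

        def forward(self, x):
            return self.drop(self.fc2(self.act(self.fc11(x)) * self.fc12(x)))
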
torch.Size([180]) || stage8.6.residual_group.blocks.3.mlp.fc2.bias + | 0.002 | -1.059 | 1.262 | 0.089 | torch.Size([180, 180]) || stage8.6.linear.weight + | 0.031 | -0.789 | 0.427 | 0.120 | torch.Size([180]) || stage8.6.linear.bias + | 0.389 | 0.079 | 1.137 | 0.176 | torch.Size([180]) || norm.weight + | -0.021 | -0.669 | 0.888 | 0.127 | torch.Size([180]) || norm.bias + | 0.000 | -0.486 | 0.568 | 0.103 | torch.Size([120, 180]) || conv_after_body.weight + | -0.000 | -0.167 | 0.168 | 0.055 | torch.Size([120]) || conv_after_body.bias + | -0.000 | -1.782 | 1.300 | 0.109 | torch.Size([64, 120, 1, 3, 3]) || conv_before_upsample.0.weight + | -0.019 | -0.542 | 0.437 | 0.162 | torch.Size([64]) || conv_before_upsample.0.bias + | 0.001 | -1.915 | 1.372 | 0.090 | torch.Size([256, 64, 1, 3, 3]) || upsample.0.weight + | -0.045 | -0.281 | 0.215 | 0.097 | torch.Size([256]) || upsample.0.bias + | -0.006 | -4.826 | 0.582 | 0.075 | torch.Size([256, 64, 1, 3, 3]) || upsample.5.weight + | -0.154 | -0.441 | 0.187 | 0.100 | torch.Size([256]) || upsample.5.bias + | 0.000 | -0.210 | 0.246 | 0.012 | torch.Size([64, 64, 1, 3, 3]) || upsample.10.weight + | 0.000 | -0.013 | 0.007 | 0.003 | torch.Size([64]) || upsample.10.bias + | 0.000 | -0.044 | 0.042 | 0.004 | torch.Size([3, 64, 1, 3, 3]) || conv_last.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([3]) || conv_last.bias + +22-03-11 10:46:12.537 : task: 001_train_vrt_videosr_bi_reds_6frames + model: vrt + gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7] + dist: False + find_unused_parameters: False + use_static_graph: True + scale: 4 + n_channels: 3 + path:[ + root: experiments + pretrained_netG: /home/cll/dev/KAIR/model_zoo/vrt/001_VRT_videosr_bi_REDS_6frames.pth + pretrained_netE: None + task: experiments/001_train_vrt_videosr_bi_reds_6frames + log: experiments/001_train_vrt_videosr_bi_reds_6frames + options: experiments/001_train_vrt_videosr_bi_reds_6frames/options + models: experiments/001_train_vrt_videosr_bi_reds_6frames/models + images: experiments/001_train_vrt_videosr_bi_reds_6frames/images + pretrained_optimizerG: None + ] + datasets:[ + train:[ + name: train_dataset + dataset_type: VideoRecurrentTrainDataset + dataroot_gt: /home/cll/datasets/REDS/train/train_sharp + dataroot_lq: /home/cll/datasets/REDS/train/train_sharp_bicubic/X4 + meta_info_file: data/meta_info/meta_info_REDS_GT.txt + filename_tmpl: 08d + filename_ext: png + val_partition: REDS4 + test_mode: False + io_backend:[ + type: disk + ] + num_frame: 6 + gt_size: 256 + interval_list: [1] + random_reverse: False + use_hflip: True + use_rot: True + dataloader_shuffle: True + dataloader_num_workers: 32 + dataloader_batch_size: 8 + phase: train + scale: 4 + n_channels: 3 + ] + test:[ + name: test_dataset + dataset_type: VideoRecurrentTestDataset + dataroot_gt: /home/cll/Desktop/REDS4/GT + dataroot_lq: /home/cll/Desktop/REDS4/sharp_bicubic + cache_data: True + io_backend:[ + type: disk + ] + num_frame: -1 + phase: test + scale: 4 + n_channels: 3 + ] + ] + netG:[ + net_type: vrt + upscale: 4 + img_size: [6, 64, 64] + window_size: [6, 8, 8] + depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4] + indep_reconsts: [11, 12] + embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180] + num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6] + spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth + pa_frames: 2 + deformable_groups: 12 + nonblind_denoising: False + use_checkpoint_attn: False + use_checkpoint_ffn: False + no_checkpoint_attn_blocks: [] + no_checkpoint_ffn_blocks: [] + init_type: default 
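
The parameter table above closes with the reconstruction head: conv_after_body (a 180-to-120 projection, per its [120, 180] weight), conv_before_upsample (120 to 64), two 64-to-256 convolutions at upsample.0 and upsample.5, a 64-to-64 convolution at upsample.10, and conv_last (64 to 3). Since 256 = 4*64, each of the two wide convolutions feeds a PixelShuffle(2), giving the x4 scale. A per-frame sketch of such a head (the logged layers are Conv3d with (1, 3, 3) kernels, i.e. the same convolution applied frame by frame; the modules hidden between the logged indices 0, 5 and 10, and the activations, are assumptions):

    import torch.nn as nn

    def make_x4_head(nf=64, out_ch=3):
        # 64 -> 256 conv + PixelShuffle(2), twice, then 64 -> 64 and a final
        # 64 -> 3 conv, matching the conv shapes logged at upsample.0 / .5 /
        # .10 and conv_last.
        return nn.Sequential(
            nn.Conv2d(nf, 4 * nf, 3, 1, 1), nn.PixelShuffle(2), nn.LeakyReLU(0.1, True),
            nn.Conv2d(nf, 4 * nf, 3, 1, 1), nn.PixelShuffle(2), nn.LeakyReLU(0.1, True),
            nn.Conv2d(nf, nf, 3, 1, 1), nn.LeakyReLU(0.1, True),
            nn.Conv2d(nf, out_ch, 3, 1, 1),
        )
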
+ scale: 4 + ] + train:[ + G_lossfn_type: charbonnier + G_lossfn_weight: 1.0 + G_charbonnier_eps: 1e-09 + E_decay: 0 + G_optimizer_type: adam + G_optimizer_lr: 0.0004 + G_optimizer_betas: [0.9, 0.99] + G_optimizer_wd: 0 + G_optimizer_clipgrad: None + G_optimizer_reuse: True + fix_iter: 20000 + fix_lr_mul: 0.125 + fix_keys: ['spynet', 'deform'] + total_iter: 300000 + G_scheduler_type: CosineAnnealingWarmRestarts + G_scheduler_periods: 300000 + G_scheduler_eta_min: 1e-07 + G_regularizer_orthstep: None + G_regularizer_clipstep: None + G_param_strict: True + E_param_strict: True + checkpoint_test: 5000 + checkpoint_save: 5000 + checkpoint_print: 200 + F_feature_layer: 34 + F_weights: 1.0 + F_lossfn_type: l1 + F_use_input_norm: True + F_use_range_norm: False + G_scheduler_restart_weights: 1 + ] + val:[ + save_img: False + pad_seq: False + flip_seq: False + center_frame_only: False + num_frame_testing: 40 + num_frame_overlapping: 2 + size_patch_testing: 128 + ] + opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json + is_train: True + merge_bn: False + merge_bn_startpoint: -1 + num_gpu: 8 + rank: 0 + world_size: 1 + +22-03-11 10:46:12.583 : Number of train images: 27,000, iters: 3,375 +22-03-11 10:46:26.822 : +Networks name: VRT +Params number: 30676435 +Net structure: +VRT( + (conv_first): Conv3d(27, 120, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (spynet): SpyNet( + (basic_module): ModuleList( + (0): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (1): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (2): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (3): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (4): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), 
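
The train:[ ... ] block above selects G_lossfn_type: charbonnier with G_charbonnier_eps: 1e-09, Adam at lr 4e-4, and a single 300k-iteration cosine decay down to eta_min: 1e-07; per the option names, fix_iter: 20000 with fix_lr_mul: 0.125 and fix_keys: ['spynet', 'deform'] holds the pretrained flow and deformable-alignment weights back early in training (frozen, then at roughly 1/8 of the base learning rate). The Charbonnier loss is a smooth L1 variant; one common form (whether eps enters squared, and the reduction, vary between implementations):

    import torch

    def charbonnier_loss(pred, target, eps=1e-9):
        # Differentiable L1 variant; eps matches G_charbonnier_eps = 1e-09.
        # Mean reduction is an assumption.
        return torch.sqrt((pred - target) ** 2 + eps).mean()
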
padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (5): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + ) + ) + (stage1): Stage( + (reshape): Sequential( + (0): Rearrange('n c d h w -> n d h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + 
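
The SpyNet printed above is six identical BasicModules forming a coarse-to-fine optical-flow pyramid; each level maps 8 input channels (two RGB frames plus the 2-channel flow upsampled from the previous level, in the standard SpyNet layout) to a 2-channel flow refinement through 7x7 convolutions. One level, reproduced from the printout:

    import torch.nn as nn

    def spynet_basic_module():
        # One pyramid level exactly as printed: 8 -> 32 -> 64 -> 32 -> 16 -> 2
        # channels, all 7x7 convs with stride 1 and padding 3, ReLU between.
        return nn.Sequential(
            nn.Conv2d(8, 32, 7, 1, 3), nn.ReLU(),
            nn.Conv2d(32, 64, 7, 1, 3), nn.ReLU(),
            nn.Conv2d(64, 32, 7, 1, 3), nn.ReLU(),
            nn.Conv2d(32, 16, 7, 1, 3), nn.ReLU(),
            nn.Conv2d(16, 2, 7, 1, 3),
        )
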
(fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + 
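
In residual_group1 every TMSA block carries both qkv_self and qkv_mut (each 120 to 360) and a proj with in_features=240, while in residual_group2 qkv_mut disappears and proj is 120 to 120. That is consistent with the first group concatenating a self-attention output and a mutual (cross-frame) attention output, each 120-dimensional, before projecting back to 120. A much-simplified sketch of that shape bookkeeping (the real TMSA adds relative position bias, window shifts and masking):

    import torch
    import torch.nn as nn

    class SelfMutualAttention(nn.Module):
        # Shape bookkeeping only: self and mutual branches each yield 120
        # channels; their concatenation explains proj's in_features=240.
        def __init__(self, dim=120, heads=6):
            super().__init__()
            self.heads = heads
            self.qkv_self = nn.Linear(dim, 3 * dim)   # as printed: 120 -> 360
            self.qkv_mut = nn.Linear(dim, 3 * dim)    # as printed: 120 -> 360
            self.proj = nn.Linear(2 * dim, dim)       # as printed: 240 -> 120

        def _attn(self, q, k, v):
            b, n, c = q.shape
            h = self.heads
            q, k, v = (t.reshape(b, n, h, c // h).transpose(1, 2) for t in (q, k, v))
            out = torch.softmax(q @ k.transpose(-2, -1) / (c // h) ** 0.5, -1) @ v
            return out.transpose(1, 2).reshape(b, n, c)

        def forward(self, x1, x2):                    # two aligned frames
            q1, k1, v1 = self.qkv_self(x1).chunk(3, -1)
            q2, k2, v2 = self.qkv_mut(x2).chunk(3, -1)
            y_self = self._attn(q1, k1, v1)           # within-frame attention
            y_mut = self._attn(q1, k2, v2)            # cross-frame attention
            return self.proj(torch.cat([y_self, y_mut], dim=-1))

    # e.g. SelfMutualAttention()(torch.randn(2, 384, 120), torch.randn(2, 384, 120))
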
(6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage2): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, 
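
The pa_deform conv_offset stack above ends in 324 output channels: with deformable_groups: 12 and a 3x3 kernel, modulated deformable convolution (DCNv2) needs 12*9*2 offset values plus 12*9 modulation masks, i.e. 12*9*3 = 324. The 242-channel input is consistent with two 120-channel feature maps plus a 2-channel optical flow (an assumption from the shapes alone). A sketch of the customary split of that output (the flow-guided offset arithmetic of DCNv2PackFlowGuided itself is omitted):

    import torch

    out = torch.randn(1, 324, 64, 64)           # conv_offset output
    o1, o2, mask = torch.chunk(out, 3, dim=1)   # three 108-channel groups
    offset = torch.cat((o1, o2), dim=1)         # 216 = 12*9*2 sampling offsets
    mask = torch.sigmoid(mask)                  # 108 = 12*9 masks in (0, 1)
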
elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, 
bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage3): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): 
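The (reshape): Sequential blocks that open stage2 through stage4 above implement the encoder's spatial downsampling: a space-to-channel rearrange halves H and W while multiplying channels by 4 (120 -> 480), then LayerNorm plus Linear fuse them back to 120. A minimal sketch of what those four printed modules compute, assuming standard einops semantics (illustrative, not the actual KAIR code):

```python
# Sketch of the printed stage2-stage4 "reshape" Sequential: space-to-channel
# downsampling. Requires einops; shapes follow the log, details are illustrative.
import torch
import torch.nn as nn
from einops.layers.torch import Rearrange

downsample = nn.Sequential(
    Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2),
    nn.LayerNorm(480),            # 4 * 120 = 480 channels after the rearrange
    nn.Linear(480, 120),          # fuse back to the 120-dim embedding
    Rearrange('n d h w c -> n c d h w'),
)

x = torch.randn(1, 120, 6, 64, 64)   # (N, C, D, H, W)
print(downsample(x).shape)           # torch.Size([1, 120, 6, 32, 32])
```

stage5 through stage7 below print the mirror pattern ('n (neiw neih c) d h w -> n d (h neih) (w neiw) c' with LayerNorm((30,)) and Linear(30, 120)), i.e. channel-to-space upsampling on the decoder side.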
DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage4): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, 
bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + 
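Every (mlp): Mlp_GEGLU in this dump pairs two input projections (fc11, fc12) with a single GELU and an output projection (fc2), with the hidden width at 2x the embedding dim (240 for the 120-dim stages, 360 for the 180-dim ones below). That layout is consistent with a gated-GELU (GEGLU) feed-forward; a minimal sketch under that assumption (the real VRT class may differ in details):

```python
# Sketch of the printed Mlp_GEGLU layout: fc2(GELU(fc11(x)) * fc12(x)).
# Dimensions match the 120-dim stages in the log.
import torch
import torch.nn as nn

class MlpGEGLU(nn.Module):
    def __init__(self, dim=120, hidden=240, drop=0.0):
        super().__init__()
        self.fc11 = nn.Linear(dim, hidden)   # gate branch
        self.fc12 = nn.Linear(dim, hidden)   # value branch
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden, dim)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        return self.drop(self.fc2(self.act(self.fc11(x)) * self.fc12(x)))

print(MlpGEGLU()(torch.randn(8, 120)).shape)  # torch.Size([8, 120])
```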
(drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage5): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): 
Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): 
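In each residual_group1 block the attention carries two QKV projections, qkv_self and qkv_mut (both 120 -> 360, i.e. 3x120 for q/k/v over 6 heads), while proj takes 240 inputs; in residual_group2 there is no qkv_mut and proj is 120 -> 120. The 240 is consistent with concatenating a 120-dim self-attention output with a 120-dim mutual-attention output before projecting back. A shape-only sketch (plain attention on both branches; the actual mutual attention exchanges queries and keys across the two frames):

```python
# Shape bookkeeping only, not VRT's implementation:
# qkv_self: 120 -> 360, qkv_mut: 120 -> 360, proj: 240 -> 120.
import torch
import torch.nn as nn
import torch.nn.functional as F

dim, heads, tokens = 120, 6, 128          # 128 = 2 frames * 8*8 window (see stats below)
qkv_self = nn.Linear(dim, 3 * dim)
qkv_mut = nn.Linear(dim, 3 * dim)
proj = nn.Linear(2 * dim, dim)

def attn(qkv):
    q, k, v = qkv.chunk(3, dim=-1)
    q, k, v = (t.view(1, tokens, heads, -1).transpose(1, 2) for t in (q, k, v))
    out = F.softmax(q @ k.transpose(-2, -1) / q.shape[-1] ** 0.5, dim=-1) @ v
    return out.transpose(1, 2).reshape(1, tokens, dim)

x = torch.randn(1, tokens, dim)
y = proj(torch.cat([attn(qkv_self(x)), attn(qkv_mut(x))], dim=-1))
print(y.shape)  # torch.Size([1, 128, 120])
```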
Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage6): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, 
out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, 
out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage7): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): 
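The pa_deform conv_offset stacks all start from 242 channels and end at 324. Both counts are consistent with the printed options: 242 = 120 + 120 + 2 (a feature map, a flow-warped neighbor feature, and a 2-channel optical flow, matching pa_frames: 2), and 324 = 12 x 9 x (2 + 1) (deformable_groups: 12, a 3x3 kernel with 9 sampling points, and 2 offset coordinates plus 1 DCNv2 modulation mask per point). A quick check of that arithmetic, under those assumptions:

```python
# Check of the conv_offset channel counts printed above (assumption:
# DCNv2-style offsets + masks, deformable_groups=12, 3x3 kernel, pa_frames=2).
deformable_groups, kh, kw = 12, 3, 3
out_ch = deformable_groups * kh * kw * (2 + 1)   # 2 offset coords + 1 mask
in_ch = 120 + 120 + 2                            # feat, warped feat, optical flow
print(in_ch, out_ch)                             # 242 324
```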
WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): 
LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage8): ModuleList( + (0): Sequential( + (0): Rearrange('n c d h w -> n d h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=120, out_features=180, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (1): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, 
out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (2): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): 
WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (3): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (4): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): 
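The stage8 entries are RTMSA rather than Stage: width grows to 180, the attention keeps only qkv_self (no mutual branch, no pa_deform), and each RTMSA wraps its TMSAG group plus a trailing Linear. The printed layout suggests a residual wrapper of roughly the form x + linear(blocks(x)); a sketch under that assumption (channel-last for brevity, whereas VRT itself works on (N, C, D, H, W) tensors with transposes):

```python
# Sketch of the RTMSA skeleton (residual wrapper around a block group);
# nn.Identity stands in for the TMSAG block list, which is elided here.
import torch
import torch.nn as nn

class RTMSA(nn.Module):
    def __init__(self, dim=180, blocks=None):
        super().__init__()
        self.residual_group = blocks or nn.Identity()  # placeholder for TMSAG
        self.linear = nn.Linear(dim, dim)

    def forward(self, x):                 # x: (N, D, H, W, C), channel-last
        return x + self.linear(self.residual_group(x))

print(RTMSA()(torch.randn(1, 6, 8, 8, 180)).shape)  # torch.Size([1, 6, 8, 8, 180])
```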
Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (5): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, 
bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (6): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, 
elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + ) + (norm): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (conv_after_body): Linear(in_features=180, out_features=120, bias=True) + (conv_before_upsample): Sequential( + (0): Conv3d(120, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): LeakyReLU(negative_slope=0.01, inplace=True) + ) + (upsample): Upsample( + (0): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): Transpose_Dim12() + (2): PixelShuffle(upscale_factor=2) + (3): Transpose_Dim12() + (4): LeakyReLU(negative_slope=0.1, inplace=True) + (5): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (6): Transpose_Dim12() + (7): PixelShuffle(upscale_factor=2) + (8): Transpose_Dim12() + (9): LeakyReLU(negative_slope=0.1, inplace=True) + (10): Conv3d(64, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + ) + (conv_last): Conv3d(64, 3, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) +) + +22-03-11 10:46:27.000 : + | mean | min | max | std || shape + | -0.000 | -1.462 | 1.580 | 0.103 | torch.Size([120, 27, 1, 3, 3]) || conv_first.weight + | 0.005 | -0.950 | 0.885 | 0.268 | torch.Size([120]) || conv_first.bias + | 0.449 | 0.406 | 0.485 | 0.040 | torch.Size([1, 3, 1, 1]) || spynet.mean + | 0.226 | 0.224 | 0.229 | 0.003 | torch.Size([1, 3, 1, 1]) || spynet.std + | -0.000 | -0.679 | 0.720 | 0.066 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.0.basic_module.0.weight + | -0.042 | -0.894 | 0.351 | 0.344 | torch.Size([32]) || spynet.basic_module.0.basic_module.0.bias + | -0.008 | -3.201 | 0.948 | 0.097 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.0.basic_module.2.weight + | 0.059 | -1.268 | 0.732 | 0.320 | torch.Size([64]) || spynet.basic_module.0.basic_module.2.bias + | -0.010 | -4.633 | 0.568 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.0.basic_module.4.weight + | 0.159 | -0.704 | 0.859 | 0.353 | torch.Size([32]) || spynet.basic_module.0.basic_module.4.bias + | -0.024 | -1.714 | 0.414 | 0.091 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.0.basic_module.6.weight + | 0.780 | -1.061 | 1.162 | 0.519 | torch.Size([16]) || spynet.basic_module.0.basic_module.6.bias + | 0.000 | -0.144 | 0.163 | 0.018 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.0.basic_module.8.weight + | 0.001 | -0.003 | 0.005 | 0.006 | torch.Size([2]) || spynet.basic_module.0.basic_module.8.bias + | 0.000 | -0.726 | 0.773 | 0.070 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.1.basic_module.0.weight + | -0.021 | -0.814 | 0.355 | 0.323 | torch.Size([32]) || spynet.basic_module.1.basic_module.0.bias + | -0.010 | -3.380 | 0.916 | 0.099 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.1.basic_module.2.weight + | 0.038 | -1.207 | 0.714 | 0.301 | torch.Size([64]) || spynet.basic_module.1.basic_module.2.bias + | -0.008 | -4.462 | 0.549 | 0.088 | torch.Size([32, 64, 7, 7]) || 
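The table that starts here (and runs for the rest of this excerpt) lists per-parameter and per-buffer statistics, one row per tensor: mean, min, max, std, then the shape and the qualified name. A small helper that reproduces this kind of table from any nn.Module (a sketch; not necessarily how KAIR formats its log):

```python
# Reproduce a "| mean | min | max | std | shape || name" table for a model's
# parameters and buffers, mirroring the rows below.
import torch

def describe(model):
    print(' | mean | min | max | std || shape')
    for name, t in list(model.named_parameters()) + list(model.named_buffers()):
        t = t.detach().float()
        print(' | {:.3f} | {:.3f} | {:.3f} | {:.3f} | {} || {}'.format(
            t.mean().item(), t.min().item(), t.max().item(), t.std().item(),
            t.shape, name))

describe(torch.nn.Linear(120, 360))
```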
spynet.basic_module.1.basic_module.4.weight + | 0.157 | -0.742 | 0.980 | 0.384 | torch.Size([32]) || spynet.basic_module.1.basic_module.4.bias + | -0.020 | -1.648 | 0.319 | 0.084 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.1.basic_module.6.weight + | 0.775 | -1.195 | 1.148 | 0.546 | torch.Size([16]) || spynet.basic_module.1.basic_module.6.bias + | -0.000 | -0.122 | 0.152 | 0.016 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.1.basic_module.8.weight + | -0.000 | -0.002 | 0.001 | 0.002 | torch.Size([2]) || spynet.basic_module.1.basic_module.8.bias + | 0.000 | -0.956 | 0.870 | 0.088 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.2.basic_module.0.weight + | -0.025 | -1.040 | 0.512 | 0.411 | torch.Size([32]) || spynet.basic_module.2.basic_module.0.bias + | -0.011 | -4.624 | 1.195 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.2.basic_module.2.weight + | 0.023 | -1.284 | 0.699 | 0.308 | torch.Size([64]) || spynet.basic_module.2.basic_module.2.bias + | -0.009 | -1.831 | 0.616 | 0.092 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.2.basic_module.4.weight + | 0.120 | -0.695 | 0.755 | 0.332 | torch.Size([32]) || spynet.basic_module.2.basic_module.4.bias + | -0.013 | -1.285 | 0.304 | 0.068 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.2.basic_module.6.weight + | 0.681 | -1.725 | 0.942 | 0.646 | torch.Size([16]) || spynet.basic_module.2.basic_module.6.bias + | 0.000 | -0.045 | 0.071 | 0.009 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.2.basic_module.8.weight + | -0.010 | -0.010 | -0.009 | 0.000 | torch.Size([2]) || spynet.basic_module.2.basic_module.8.bias + | -0.000 | -0.995 | 0.879 | 0.090 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.3.basic_module.0.weight + | -0.040 | -1.137 | 0.617 | 0.461 | torch.Size([32]) || spynet.basic_module.3.basic_module.0.bias + | -0.010 | -4.891 | 1.224 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.3.basic_module.2.weight + | 0.022 | -1.287 | 0.745 | 0.313 | torch.Size([64]) || spynet.basic_module.3.basic_module.2.bias + | -0.010 | -1.802 | 0.561 | 0.090 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.3.basic_module.4.weight + | 0.118 | -0.694 | 0.697 | 0.329 | torch.Size([32]) || spynet.basic_module.3.basic_module.4.bias + | -0.012 | -1.107 | 0.306 | 0.064 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.3.basic_module.6.weight + | 0.658 | -1.792 | 0.905 | 0.659 | torch.Size([16]) || spynet.basic_module.3.basic_module.6.bias + | 0.000 | -0.030 | 0.037 | 0.006 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.3.basic_module.8.weight + | 0.003 | -0.001 | 0.007 | 0.006 | torch.Size([2]) || spynet.basic_module.3.basic_module.8.bias + | -0.000 | -0.990 | 0.880 | 0.090 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.4.basic_module.0.weight + | -0.010 | -1.067 | 0.596 | 0.437 | torch.Size([32]) || spynet.basic_module.4.basic_module.0.bias + | -0.010 | -5.061 | 1.229 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.4.basic_module.2.weight + | 0.024 | -1.274 | 0.830 | 0.318 | torch.Size([64]) || spynet.basic_module.4.basic_module.2.bias + | -0.009 | -1.787 | 0.563 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.4.basic_module.4.weight + | 0.130 | -0.685 | 0.743 | 0.335 | torch.Size([32]) || spynet.basic_module.4.basic_module.4.bias + | -0.011 | -0.973 | 0.292 | 0.061 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.4.basic_module.6.weight + | 0.659 | -1.855 | 0.931 | 0.679 | torch.Size([16]) || spynet.basic_module.4.basic_module.6.bias + | 0.000 | -0.034 | 0.040 | 0.005 | 
torch.Size([2, 16, 7, 7]) || spynet.basic_module.4.basic_module.8.weight + | -0.001 | -0.009 | 0.007 | 0.012 | torch.Size([2]) || spynet.basic_module.4.basic_module.8.bias + | -0.000 | -0.973 | 0.853 | 0.089 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.5.basic_module.0.weight + | 0.022 | -1.001 | 0.571 | 0.440 | torch.Size([32]) || spynet.basic_module.5.basic_module.0.bias + | -0.009 | -5.095 | 1.251 | 0.119 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.5.basic_module.2.weight + | 0.026 | -1.305 | 0.880 | 0.326 | torch.Size([64]) || spynet.basic_module.5.basic_module.2.bias + | -0.008 | -1.815 | 0.561 | 0.091 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.5.basic_module.4.weight + | 0.137 | -0.711 | 0.771 | 0.342 | torch.Size([32]) || spynet.basic_module.5.basic_module.4.bias + | -0.010 | -0.986 | 0.286 | 0.059 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.5.basic_module.6.weight + | 0.671 | -1.913 | 0.966 | 0.700 | torch.Size([16]) || spynet.basic_module.5.basic_module.6.bias + | 0.000 | -0.034 | 0.028 | 0.002 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.5.basic_module.8.weight + | 0.002 | -0.013 | 0.016 | 0.020 | torch.Size([2]) || spynet.basic_module.5.basic_module.8.bias + | 1.280 | 0.669 | 1.862 | 0.274 | torch.Size([120]) || stage1.reshape.1.weight + | -0.006 | -0.324 | 0.337 | 0.106 | torch.Size([120]) || stage1.reshape.1.bias + | 0.579 | 0.129 | 1.064 | 0.236 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.weight + | -0.039 | -1.100 | 0.894 | 0.226 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.bias + | -0.134 | -4.020 | 2.585 | 0.295 | torch.Size([675, 6]) || stage1.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.579 | 0.618 | 0.113 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_self.weight + | 0.000 | -0.319 | 0.279 | 0.074 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_self.bias + | 0.001 | -0.634 | 0.686 | 0.076 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.attn.proj.weight + | -0.014 | -0.222 | 0.642 | 0.088 | torch.Size([120]) || stage1.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -1.066 | 0.928 | 0.097 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.000 | -0.146 | 0.190 | 0.033 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.781 | 0.367 | 1.203 | 0.160 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.weight + | 0.029 | -0.378 | 0.545 | 0.159 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.bias + | 0.001 | -0.687 | 0.753 | 0.108 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc11.weight + | -0.010 | -0.229 | 0.633 | 0.095 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.674 | 0.669 | 0.117 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc12.weight + | 0.011 | -0.448 | 0.368 | 0.116 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc12.bias + | 0.001 | -0.862 | 0.941 | 0.119 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.mlp.fc2.weight + | -0.004 | -0.267 | 0.594 | 0.099 | torch.Size([120]) || stage1.residual_group1.blocks.0.mlp.fc2.bias + | 0.797 | 0.211 | 1.475 | 0.209 | torch.Size([120]) || 
stage1.residual_group1.blocks.1.norm1.weight + | -0.161 | -1.941 | 0.746 | 0.237 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.bias + | -0.296 | -3.927 | 2.840 | 0.478 | torch.Size([675, 6]) || stage1.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.1.attn.position_bias + | 0.001 | -1.479 | 1.395 | 0.143 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_self.weight + | -0.003 | -0.381 | 0.258 | 0.063 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.526 | 0.561 | 0.079 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.attn.proj.weight + | -0.003 | -0.178 | 0.478 | 0.078 | torch.Size([120]) || stage1.residual_group1.blocks.1.attn.proj.bias + | 0.001 | -1.242 | 1.138 | 0.105 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.004 | -0.213 | 0.196 | 0.050 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.702 | 0.349 | 0.904 | 0.085 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.weight + | 0.039 | -0.646 | 0.384 | 0.132 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.872 | 0.750 | 0.131 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc11.weight + | -0.049 | -0.353 | 0.135 | 0.084 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.562 | 0.580 | 0.117 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc12.weight + | 0.000 | -0.238 | 0.457 | 0.113 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.828 | 0.685 | 0.123 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.mlp.fc2.weight + | 0.031 | -0.297 | 0.419 | 0.094 | torch.Size([120]) || stage1.residual_group1.blocks.1.mlp.fc2.bias + | 0.984 | 0.163 | 1.398 | 0.202 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.weight + | -0.167 | -1.609 | 0.367 | 0.182 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.bias + | -0.343 | -4.484 | 2.362 | 0.486 | torch.Size([675, 6]) || stage1.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.2.attn.position_bias + | 0.000 | -1.586 | 1.649 | 0.151 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_self.weight + | -0.000 | -0.220 | 0.240 | 0.056 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.378 | 0.514 | 0.086 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.attn.proj.weight + | -0.009 | -0.143 | 0.172 | 0.059 | torch.Size([120]) || stage1.residual_group1.blocks.2.attn.proj.bias + | 0.001 | -0.639 | 0.582 | 0.102 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.000 | -0.141 | 0.173 | 0.035 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.733 | 0.277 | 0.903 | 0.081 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.weight + | 0.038 | -0.861 | 0.359 | 0.142 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.787 | 0.679 | 0.131 | torch.Size([240, 120]) || 
stage1.residual_group1.blocks.2.mlp.fc11.weight + | -0.029 | -0.365 | 0.143 | 0.076 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.574 | 0.539 | 0.120 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc12.weight + | -0.007 | -0.283 | 0.254 | 0.097 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc12.bias + | 0.001 | -0.998 | 0.522 | 0.124 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.mlp.fc2.weight + | 0.030 | -0.169 | 0.293 | 0.095 | torch.Size([120]) || stage1.residual_group1.blocks.2.mlp.fc2.bias + | 1.035 | 0.143 | 1.397 | 0.196 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.weight + | -0.161 | -1.413 | 0.084 | 0.154 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.bias + | -0.441 | -4.685 | 3.306 | 0.529 | torch.Size([675, 6]) || stage1.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.3.attn.position_bias + | 0.000 | -1.590 | 1.329 | 0.155 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_self.weight + | -0.002 | -0.266 | 0.232 | 0.049 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.366 | 0.372 | 0.084 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.attn.proj.weight + | -0.011 | -0.225 | 0.171 | 0.071 | torch.Size([120]) || stage1.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.660 | 0.801 | 0.100 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.001 | -0.139 | 0.200 | 0.031 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.724 | 0.190 | 0.911 | 0.091 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.weight + | 0.038 | -0.981 | 0.285 | 0.137 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.bias + | 0.001 | -0.611 | 0.598 | 0.130 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc11.weight + | -0.035 | -0.299 | 0.221 | 0.081 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.502 | 0.520 | 0.124 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc12.weight + | -0.002 | -0.271 | 0.215 | 0.090 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.558 | 0.898 | 0.127 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.mlp.fc2.weight + | 0.010 | -0.424 | 0.190 | 0.082 | torch.Size([120]) || stage1.residual_group1.blocks.3.mlp.fc2.bias + | 1.085 | 0.169 | 1.400 | 0.157 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.weight + | -0.086 | -1.613 | 0.150 | 0.160 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.bias + | -0.541 | -3.902 | 3.728 | 0.633 | torch.Size([675, 6]) || stage1.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.4.attn.position_bias + | 0.001 | -1.879 | 1.832 | 0.150 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_self.weight + | 0.001 | -0.391 | 0.444 | 0.079 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_self.bias + | -0.000 | -0.407 | 0.448 | 0.087 | torch.Size([120, 240]) || 
stage1.residual_group1.blocks.4.attn.proj.weight + | -0.013 | -0.302 | 0.342 | 0.104 | torch.Size([120]) || stage1.residual_group1.blocks.4.attn.proj.bias + | -0.001 | -0.830 | 0.863 | 0.102 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.001 | -0.117 | 0.094 | 0.024 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.704 | 0.195 | 0.870 | 0.079 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.weight + | 0.031 | -1.069 | 0.276 | 0.140 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.656 | 0.555 | 0.130 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc11.weight + | -0.029 | -0.387 | 0.256 | 0.102 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc11.bias + | 0.001 | -0.590 | 0.624 | 0.127 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc12.weight + | -0.011 | -0.277 | 0.303 | 0.087 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -1.124 | 0.539 | 0.130 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.mlp.fc2.weight + | -0.006 | -0.718 | 0.133 | 0.094 | torch.Size([120]) || stage1.residual_group1.blocks.4.mlp.fc2.bias + | 1.037 | 0.176 | 1.327 | 0.158 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.weight + | -0.112 | -1.591 | 0.177 | 0.169 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.bias + | -0.438 | -2.229 | 2.797 | 0.523 | torch.Size([675, 6]) || stage1.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.5.attn.position_bias + | -0.000 | -2.212 | 1.826 | 0.153 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_self.weight + | 0.001 | -0.343 | 0.338 | 0.068 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_self.bias + | 0.000 | -0.367 | 0.451 | 0.087 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.attn.proj.weight + | -0.022 | -0.358 | 0.242 | 0.128 | torch.Size([120]) || stage1.residual_group1.blocks.5.attn.proj.bias + | 0.001 | -0.922 | 0.886 | 0.104 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.002 | -0.083 | 0.089 | 0.022 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.662 | 0.277 | 0.831 | 0.066 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.weight + | 0.025 | -0.959 | 0.261 | 0.132 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.bias + | -0.001 | -0.636 | 0.739 | 0.129 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc11.weight + | -0.030 | -0.419 | 0.517 | 0.115 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.615 | 0.709 | 0.126 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc12.weight + | 0.002 | -0.230 | 0.457 | 0.087 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc12.bias + | 0.001 | -1.724 | 1.186 | 0.132 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.mlp.fc2.weight + | -0.019 | -1.909 | 0.255 | 0.190 | torch.Size([120]) || stage1.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.242 | 0.244 | 0.057 | torch.Size([120, 120]) || stage1.linear1.weight + | 0.004 | -0.221 | 0.224 | 0.083 | torch.Size([120]) || stage1.linear1.bias + | 0.737 | 0.334 | 1.046 | 0.119 | torch.Size([120]) || 
stage1.residual_group2.blocks.0.norm1.weight + | 0.013 | -0.911 | 0.763 | 0.193 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.bias + | -0.052 | -2.462 | 2.040 | 0.273 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.785 | 0.767 | 0.123 | torch.Size([360, 120]) || stage1.residual_group2.blocks.0.attn.qkv_self.weight + | 0.009 | -0.466 | 0.552 | 0.122 | torch.Size([360]) || stage1.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.431 | 0.475 | 0.091 | torch.Size([120, 120]) || stage1.residual_group2.blocks.0.attn.proj.weight + | -0.009 | -0.796 | 0.497 | 0.109 | torch.Size([120]) || stage1.residual_group2.blocks.0.attn.proj.bias + | 0.573 | 0.409 | 0.935 | 0.096 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.weight + | 0.015 | -0.828 | 0.839 | 0.175 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.bias + | 0.001 | -0.604 | 0.542 | 0.109 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc11.weight + | 0.037 | -0.179 | 0.273 | 0.076 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.666 | 0.553 | 0.116 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc12.weight + | -0.001 | -0.416 | 0.396 | 0.116 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc12.bias + | 0.001 | -0.654 | 0.538 | 0.118 | torch.Size([120, 240]) || stage1.residual_group2.blocks.0.mlp.fc2.weight + | -0.002 | -0.470 | 0.310 | 0.122 | torch.Size([120]) || stage1.residual_group2.blocks.0.mlp.fc2.bias + | 0.951 | 0.342 | 1.189 | 0.111 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.weight + | 0.010 | -0.697 | 0.802 | 0.166 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.bias + | -0.098 | -2.648 | 2.410 | 0.214 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.733 | 0.886 | 0.139 | torch.Size([360, 120]) || stage1.residual_group2.blocks.1.attn.qkv_self.weight + | -0.002 | -0.468 | 0.550 | 0.132 | torch.Size([360]) || stage1.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.435 | 0.377 | 0.096 | torch.Size([120, 120]) || stage1.residual_group2.blocks.1.attn.proj.weight + | -0.001 | -0.359 | 0.258 | 0.114 | torch.Size([120]) || stage1.residual_group2.blocks.1.attn.proj.bias + | 0.582 | 0.305 | 0.717 | 0.055 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.weight + | 0.008 | -0.714 | 0.833 | 0.131 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.bias + | 0.001 | -0.732 | 0.501 | 0.118 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc11.weight + | 0.004 | -0.306 | 0.267 | 0.091 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.510 | 0.533 | 0.126 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc12.weight + | -0.000 | -0.315 | 0.291 | 0.090 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc12.bias + | 0.000 | -0.736 | 0.789 | 0.126 | torch.Size([120, 240]) || stage1.residual_group2.blocks.1.mlp.fc2.weight + | -0.000 | -1.274 | 1.328 | 0.200 | torch.Size([120]) || stage1.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.390 | 0.303 | 0.069 | torch.Size([120, 120]) || stage1.linear2.weight + | 0.010 | 
-0.219 | 0.227 | 0.087 | torch.Size([120]) || stage1.linear2.bias + | -0.000 | -0.095 | 0.106 | 0.024 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.weight + | -0.001 | -0.036 | 0.036 | 0.013 | torch.Size([120]) || stage1.pa_deform.bias + | -0.000 | -0.136 | 0.141 | 0.017 | torch.Size([120, 242, 3, 3]) || stage1.pa_deform.conv_offset.0.weight + | -0.002 | -0.028 | 0.024 | 0.013 | torch.Size([120]) || stage1.pa_deform.conv_offset.0.bias + | -0.001 | -0.156 | 0.104 | 0.019 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.2.weight + | -0.008 | -0.055 | 0.045 | 0.022 | torch.Size([120]) || stage1.pa_deform.conv_offset.2.bias + | -0.001 | -0.098 | 0.106 | 0.018 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.4.weight + | -0.000 | -0.081 | 0.070 | 0.029 | torch.Size([120]) || stage1.pa_deform.conv_offset.4.bias + | -0.000 | -0.375 | 0.279 | 0.027 | torch.Size([324, 120, 3, 3]) || stage1.pa_deform.conv_offset.6.weight + | -0.003 | -0.074 | 0.070 | 0.028 | torch.Size([324]) || stage1.pa_deform.conv_offset.6.bias + | -0.000 | -0.776 | 0.733 | 0.114 | torch.Size([360, 360]) || stage1.pa_fuse.fc11.weight + | 0.021 | -0.239 | 0.513 | 0.121 | torch.Size([360]) || stage1.pa_fuse.fc11.bias + | 0.001 | -1.100 | 1.143 | 0.149 | torch.Size([360, 360]) || stage1.pa_fuse.fc12.weight + | 0.008 | -0.405 | 0.393 | 0.136 | torch.Size([360]) || stage1.pa_fuse.fc12.bias + | 0.000 | -0.963 | 0.899 | 0.142 | torch.Size([120, 360]) || stage1.pa_fuse.fc2.weight + | -0.055 | -0.616 | 0.599 | 0.197 | torch.Size([120]) || stage1.pa_fuse.fc2.bias + | 1.149 | 0.345 | 1.921 | 0.289 | torch.Size([480]) || stage2.reshape.1.weight + | 0.017 | -0.502 | 0.663 | 0.141 | torch.Size([480]) || stage2.reshape.1.bias + | -0.000 | -0.609 | 0.736 | 0.146 | torch.Size([120, 480]) || stage2.reshape.2.weight + | 0.006 | -0.136 | 0.404 | 0.077 | torch.Size([120]) || stage2.reshape.2.bias + | 0.686 | 0.172 | 1.113 | 0.175 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.weight + | -0.154 | -0.926 | 0.339 | 0.217 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.bias + | -0.120 | -1.869 | 4.616 | 0.310 | torch.Size([675, 6]) || stage2.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.514 | 0.499 | 0.102 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_self.weight + | -0.002 | -0.214 | 0.177 | 0.044 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_self.bias + | -0.001 | -0.499 | 0.529 | 0.093 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.attn.proj.weight + | -0.004 | -0.171 | 0.556 | 0.087 | torch.Size([120]) || stage2.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.642 | 0.598 | 0.083 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.141 | 0.125 | 0.027 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.592 | 0.325 | 0.794 | 0.096 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.weight + | 0.008 | -0.649 | 0.445 | 0.168 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.485 | 0.457 | 0.116 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc11.weight + | -0.053 | -0.240 | 0.171 | 0.062 | torch.Size([240]) || 
stage2.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.503 | 0.462 | 0.118 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc12.weight + | 0.005 | -0.177 | 0.268 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.690 | 0.498 | 0.123 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.mlp.fc2.weight + | -0.007 | -0.270 | 0.472 | 0.097 | torch.Size([120]) || stage2.residual_group1.blocks.0.mlp.fc2.bias + | 0.864 | 0.187 | 1.221 | 0.164 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.weight + | -0.146 | -1.128 | 0.299 | 0.204 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.bias + | -0.241 | -1.607 | 8.958 | 0.356 | torch.Size([675, 6]) || stage2.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.561 | 0.538 | 0.116 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_self.weight + | 0.001 | -0.198 | 0.222 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_self.bias + | 0.001 | -0.475 | 0.479 | 0.099 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.attn.proj.weight + | -0.006 | -0.295 | 0.341 | 0.101 | torch.Size([120]) || stage2.residual_group1.blocks.1.attn.proj.bias + | 0.001 | -0.961 | 0.789 | 0.080 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.001 | -0.105 | 0.143 | 0.024 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.653 | 0.401 | 0.810 | 0.063 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.weight + | 0.009 | -0.767 | 0.367 | 0.154 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.486 | 0.499 | 0.117 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc11.weight + | -0.056 | -0.185 | 0.147 | 0.058 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.548 | 0.121 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc12.weight + | 0.002 | -0.231 | 0.177 | 0.071 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc12.bias + | -0.001 | -0.578 | 0.609 | 0.123 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.mlp.fc2.weight + | -0.003 | -0.350 | 0.216 | 0.098 | torch.Size([120]) || stage2.residual_group1.blocks.1.mlp.fc2.bias + | 0.848 | 0.172 | 1.107 | 0.144 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.weight + | -0.168 | -1.123 | 0.330 | 0.178 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.bias + | -0.074 | -1.239 | 4.293 | 0.247 | torch.Size([675, 6]) || stage2.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.2.attn.position_bias + | -0.001 | -0.643 | 0.531 | 0.117 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_self.weight + | 0.003 | -0.220 | 0.376 | 0.047 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.529 | 0.479 | 0.100 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.230 | 0.295 | 0.074 | torch.Size([120]) || 
stage2.residual_group1.blocks.2.attn.proj.bias + | -0.001 | -0.726 | 0.768 | 0.091 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.001 | -0.167 | 0.193 | 0.028 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.695 | 0.334 | 0.833 | 0.068 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.weight + | 0.012 | -0.755 | 0.517 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.bias + | 0.001 | -0.474 | 0.480 | 0.119 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc11.weight + | -0.049 | -0.218 | 0.148 | 0.067 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.529 | 0.542 | 0.124 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc12.weight + | -0.006 | -0.245 | 0.239 | 0.073 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc12.bias + | -0.001 | -0.541 | 0.485 | 0.124 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.mlp.fc2.weight + | 0.000 | -0.318 | 0.170 | 0.077 | torch.Size([120]) || stage2.residual_group1.blocks.2.mlp.fc2.bias + | 0.903 | 0.178 | 1.124 | 0.124 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.weight + | -0.138 | -1.223 | 0.440 | 0.177 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.bias + | -0.164 | -1.383 | 5.910 | 0.305 | torch.Size([675, 6]) || stage2.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.526 | 0.496 | 0.120 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_self.weight + | 0.000 | -0.250 | 0.273 | 0.061 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.447 | 0.524 | 0.097 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.attn.proj.weight + | -0.003 | -0.243 | 0.256 | 0.082 | torch.Size([120]) || stage2.residual_group1.blocks.3.attn.proj.bias + | -0.001 | -0.551 | 0.730 | 0.083 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.001 | -0.145 | 0.126 | 0.024 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.707 | 0.319 | 0.855 | 0.063 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.weight + | 0.013 | -0.839 | 0.507 | 0.155 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.509 | 0.508 | 0.118 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc11.weight + | -0.051 | -0.219 | 0.155 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.475 | 0.592 | 0.124 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc12.weight + | -0.002 | -0.162 | 0.220 | 0.069 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.465 | 0.528 | 0.124 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.mlp.fc2.weight + | -0.002 | -0.243 | 0.286 | 0.088 | torch.Size([120]) || stage2.residual_group1.blocks.3.mlp.fc2.bias + | 0.948 | 0.220 | 1.175 | 0.108 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.weight + | -0.125 | -1.093 | 0.385 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.bias + | -0.150 | -1.632 | 4.522 | 0.341 | torch.Size([675, 6]) || stage2.residual_group1.blocks.4.attn.relative_position_bias_table + | 
337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.636 | 0.543 | 0.119 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_self.weight + | -0.001 | -0.254 | 0.262 | 0.048 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_self.bias + | 0.001 | -0.632 | 0.628 | 0.112 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.attn.proj.weight + | -0.005 | -0.240 | 0.330 | 0.104 | torch.Size([120]) || stage2.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.476 | 0.479 | 0.088 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.001 | -0.112 | 0.134 | 0.020 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.686 | 0.264 | 0.797 | 0.060 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.weight + | 0.012 | -0.889 | 0.427 | 0.140 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.bias + | 0.001 | -0.476 | 0.478 | 0.117 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc11.weight + | -0.051 | -0.267 | 0.180 | 0.071 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.506 | 0.517 | 0.127 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc12.weight + | 0.002 | -0.172 | 0.241 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc12.bias + | -0.001 | -0.570 | 0.542 | 0.126 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.mlp.fc2.weight + | -0.003 | -0.631 | 0.395 | 0.123 | torch.Size([120]) || stage2.residual_group1.blocks.4.mlp.fc2.bias + | 0.912 | 0.189 | 1.122 | 0.104 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.weight + | -0.114 | -1.125 | 0.188 | 0.140 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.bias + | -0.099 | -1.285 | 1.708 | 0.236 | torch.Size([675, 6]) || stage2.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.496 | 0.540 | 0.119 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_self.weight + | 0.003 | -0.260 | 0.228 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.511 | 0.454 | 0.095 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.attn.proj.weight + | 0.000 | -0.711 | 0.286 | 0.115 | torch.Size([120]) || stage2.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.444 | 0.454 | 0.082 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.000 | -0.101 | 0.133 | 0.021 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.668 | 0.312 | 0.800 | 0.056 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.weight + | 0.015 | -0.778 | 0.372 | 0.111 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.485 | 0.469 | 0.115 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc11.weight + | -0.045 | -0.294 | 0.173 | 0.083 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc11.bias + | 0.000 | -0.554 | 0.540 | 0.129 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc12.weight + | 0.001 | -0.183 
| 0.199 | 0.077 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.879 | 0.824 | 0.127 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.mlp.fc2.weight + | 0.001 | -1.670 | 0.358 | 0.208 | torch.Size([120]) || stage2.residual_group1.blocks.5.mlp.fc2.bias + | 0.001 | -0.253 | 0.346 | 0.068 | torch.Size([120, 120]) || stage2.linear1.weight + | 0.007 | -0.248 | 0.241 | 0.103 | torch.Size([120]) || stage2.linear1.bias + | 1.012 | 0.613 | 1.327 | 0.116 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.weight + | 0.019 | -0.724 | 0.685 | 0.244 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.bias + | 0.003 | -2.959 | 1.705 | 0.151 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.636 | 0.617 | 0.125 | torch.Size([360, 120]) || stage2.residual_group2.blocks.0.attn.qkv_self.weight + | -0.002 | -0.291 | 0.292 | 0.085 | torch.Size([360]) || stage2.residual_group2.blocks.0.attn.qkv_self.bias + | -0.002 | -0.476 | 0.512 | 0.138 | torch.Size([120, 120]) || stage2.residual_group2.blocks.0.attn.proj.weight + | -0.002 | -0.263 | 0.398 | 0.135 | torch.Size([120]) || stage2.residual_group2.blocks.0.attn.proj.bias + | 0.677 | 0.521 | 0.840 | 0.063 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.weight + | 0.010 | -0.710 | 0.541 | 0.173 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.bias + | 0.001 | -0.540 | 0.507 | 0.112 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc11.weight + | -0.016 | -0.242 | 0.201 | 0.077 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc11.bias + | 0.000 | -0.519 | 0.479 | 0.122 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc12.weight + | -0.006 | -0.162 | 0.231 | 0.071 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc12.bias + | -0.001 | -0.449 | 0.494 | 0.121 | torch.Size([120, 240]) || stage2.residual_group2.blocks.0.mlp.fc2.weight + | 0.002 | -0.293 | 0.222 | 0.095 | torch.Size([120]) || stage2.residual_group2.blocks.0.mlp.fc2.bias + | 1.053 | 0.832 | 1.269 | 0.079 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.weight + | 0.015 | -0.549 | 0.428 | 0.189 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.bias + | 0.007 | -3.099 | 1.550 | 0.170 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.673 | 0.604 | 0.131 | torch.Size([360, 120]) || stage2.residual_group2.blocks.1.attn.qkv_self.weight + | -0.001 | -0.416 | 0.391 | 0.089 | torch.Size([360]) || stage2.residual_group2.blocks.1.attn.qkv_self.bias + | -0.000 | -0.569 | 0.560 | 0.139 | torch.Size([120, 120]) || stage2.residual_group2.blocks.1.attn.proj.weight + | 0.004 | -0.613 | 0.428 | 0.158 | torch.Size([120]) || stage2.residual_group2.blocks.1.attn.proj.bias + | 0.762 | 0.464 | 0.954 | 0.085 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.weight + | 0.005 | -0.745 | 0.381 | 0.117 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.441 | 0.448 | 0.110 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc11.weight + | 0.019 | -0.292 | 0.460 | 0.117 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | 
-0.491 | 0.490 | 0.126 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc12.weight + | -0.007 | -0.285 | 0.177 | 0.068 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.535 | 0.631 | 0.125 | torch.Size([120, 240]) || stage2.residual_group2.blocks.1.mlp.fc2.weight + | -0.011 | -0.765 | 0.337 | 0.142 | torch.Size([120]) || stage2.residual_group2.blocks.1.mlp.fc2.bias + | 0.001 | -0.367 | 0.372 | 0.074 | torch.Size([120, 120]) || stage2.linear2.weight + | 0.009 | -0.288 | 0.342 | 0.130 | torch.Size([120]) || stage2.linear2.bias + | 0.000 | -0.112 | 0.093 | 0.022 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.weight + | -0.002 | -0.036 | 0.035 | 0.016 | torch.Size([120]) || stage2.pa_deform.bias + | 0.000 | -0.068 | 0.080 | 0.016 | torch.Size([120, 242, 3, 3]) || stage2.pa_deform.conv_offset.0.weight + | -0.009 | -0.035 | 0.023 | 0.013 | torch.Size([120]) || stage2.pa_deform.conv_offset.0.bias + | 0.000 | -0.068 | 0.079 | 0.019 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.2.weight + | -0.014 | -0.061 | 0.036 | 0.021 | torch.Size([120]) || stage2.pa_deform.conv_offset.2.bias + | -0.001 | -0.082 | 0.079 | 0.019 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.4.weight + | -0.003 | -0.075 | 0.069 | 0.035 | torch.Size([120]) || stage2.pa_deform.conv_offset.4.bias + | -0.000 | -0.166 | 0.139 | 0.016 | torch.Size([324, 120, 3, 3]) || stage2.pa_deform.conv_offset.6.weight + | -0.015 | -0.090 | 0.050 | 0.030 | torch.Size([324]) || stage2.pa_deform.conv_offset.6.bias + | -0.002 | -0.642 | 0.663 | 0.127 | torch.Size([360, 360]) || stage2.pa_fuse.fc11.weight + | 0.130 | -0.171 | 0.480 | 0.140 | torch.Size([360]) || stage2.pa_fuse.fc11.bias + | -0.000 | -0.696 | 0.620 | 0.118 | torch.Size([360, 360]) || stage2.pa_fuse.fc12.weight + | -0.007 | -0.337 | 0.301 | 0.102 | torch.Size([360]) || stage2.pa_fuse.fc12.bias + | 0.000 | -0.650 | 0.657 | 0.128 | torch.Size([120, 360]) || stage2.pa_fuse.fc2.weight + | 0.013 | -0.507 | 0.451 | 0.215 | torch.Size([120]) || stage2.pa_fuse.fc2.bias + | 1.067 | 0.372 | 1.778 | 0.269 | torch.Size([480]) || stage3.reshape.1.weight + | -0.004 | -0.699 | 0.521 | 0.227 | torch.Size([480]) || stage3.reshape.1.bias + | -0.000 | -0.643 | 0.743 | 0.138 | torch.Size([120, 480]) || stage3.reshape.2.weight + | 0.009 | -0.176 | 0.243 | 0.079 | torch.Size([120]) || stage3.reshape.2.bias + | 0.785 | 0.469 | 1.029 | 0.105 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.weight + | -0.102 | -0.716 | 0.311 | 0.179 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.bias + | -0.001 | -0.340 | 0.163 | 0.033 | torch.Size([675, 6]) || stage3.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.328 | 0.302 | 0.061 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_self.weight + | 0.004 | -0.232 | 0.189 | 0.063 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.343 | 0.346 | 0.058 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.attn.proj.weight + | 0.004 | -0.335 | 0.229 | 0.102 | torch.Size([120]) || stage3.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.366 | 0.325 | 0.052 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_mut.weight + 
| -0.001 | -0.091 | 0.074 | 0.017 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.751 | 0.517 | 0.928 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.weight + | 0.002 | -0.271 | 0.189 | 0.101 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.371 | 0.388 | 0.096 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc11.weight + | -0.073 | -0.203 | 0.039 | 0.046 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.400 | 0.401 | 0.094 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc12.weight + | -0.000 | -0.178 | 0.128 | 0.052 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc12.bias + | -0.001 | -0.410 | 0.429 | 0.098 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.mlp.fc2.weight + | 0.006 | -0.345 | 0.304 | 0.108 | torch.Size([120]) || stage3.residual_group1.blocks.0.mlp.fc2.bias + | 0.816 | 0.469 | 1.015 | 0.110 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.weight + | -0.103 | -0.647 | 0.225 | 0.140 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.bias + | 0.001 | -0.464 | 0.239 | 0.034 | torch.Size([675, 6]) || stage3.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.1.attn.position_bias + | -0.000 | -0.304 | 0.359 | 0.061 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_self.weight + | 0.001 | -0.173 | 0.193 | 0.047 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.299 | 0.408 | 0.055 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.attn.proj.weight + | 0.007 | -0.511 | 0.239 | 0.113 | torch.Size([120]) || stage3.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.288 | 0.254 | 0.049 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.001 | -0.060 | 0.054 | 0.016 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.796 | 0.609 | 0.971 | 0.076 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.weight + | -0.002 | -0.327 | 0.247 | 0.122 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.379 | 0.407 | 0.094 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc11.weight + | -0.077 | -0.214 | 0.034 | 0.045 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.391 | 0.432 | 0.092 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc12.weight + | 0.005 | -0.176 | 0.112 | 0.044 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc12.bias + | 0.000 | -0.378 | 0.399 | 0.093 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.mlp.fc2.weight + | 0.009 | -0.410 | 0.306 | 0.110 | torch.Size([120]) || stage3.residual_group1.blocks.1.mlp.fc2.bias + | 0.854 | 0.447 | 0.995 | 0.090 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.weight + | -0.086 | -0.513 | 0.198 | 0.116 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.bias + | -0.001 | -0.189 | 0.292 | 0.033 | torch.Size([675, 6]) || stage3.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | 
torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.390 | 0.367 | 0.067 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_self.weight + | -0.002 | -0.310 | 0.284 | 0.078 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.334 | 0.296 | 0.061 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.attn.proj.weight + | 0.004 | -0.356 | 0.299 | 0.096 | torch.Size([120]) || stage3.residual_group1.blocks.2.attn.proj.bias + | 0.000 | -0.276 | 0.315 | 0.055 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.000 | -0.094 | 0.066 | 0.014 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.829 | 0.673 | 1.017 | 0.074 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.weight + | 0.003 | -0.259 | 0.228 | 0.098 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.bias + | 0.001 | -0.410 | 0.385 | 0.091 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc11.weight + | -0.085 | -0.200 | 0.017 | 0.044 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.348 | 0.378 | 0.090 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc12.weight + | 0.001 | -0.130 | 0.105 | 0.042 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.346 | 0.425 | 0.090 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.mlp.fc2.weight + | 0.005 | -0.363 | 0.241 | 0.094 | torch.Size([120]) || stage3.residual_group1.blocks.2.mlp.fc2.bias + | 0.872 | 0.554 | 1.068 | 0.102 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.weight + | -0.057 | -0.402 | 0.133 | 0.087 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.bias + | 0.003 | -0.365 | 0.217 | 0.050 | torch.Size([675, 6]) || stage3.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.359 | 0.357 | 0.065 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_self.weight + | -0.002 | -0.265 | 0.294 | 0.062 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.300 | 0.271 | 0.054 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.attn.proj.weight + | 0.002 | -0.316 | 0.215 | 0.094 | torch.Size([120]) || stage3.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.370 | 0.329 | 0.039 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.000 | -0.056 | 0.066 | 0.013 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.842 | 0.631 | 0.989 | 0.073 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.weight + | -0.001 | -0.216 | 0.263 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.bias + | 0.001 | -0.388 | 0.391 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc11.weight + | -0.087 | -0.202 | 0.032 | 0.048 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.364 | 0.428 | 0.088 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc12.weight + | -0.000 | -0.137 | 0.106 | 0.043 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc12.bias + | -0.001 | -0.390 | 0.339 | 0.088 | torch.Size([120, 240]) || 
stage3.residual_group1.blocks.3.mlp.fc2.weight + | 0.003 | -0.376 | 0.203 | 0.090 | torch.Size([120]) || stage3.residual_group1.blocks.3.mlp.fc2.bias + | 0.913 | 0.498 | 1.102 | 0.096 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.weight + | -0.048 | -0.340 | 0.105 | 0.071 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.bias + | 0.001 | -0.706 | 0.306 | 0.058 | torch.Size([675, 6]) || stage3.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.4.attn.position_bias + | 0.000 | -0.373 | 0.339 | 0.076 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_self.weight + | -0.004 | -0.301 | 0.301 | 0.074 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.278 | 0.277 | 0.058 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.attn.proj.weight + | 0.003 | -0.310 | 0.240 | 0.079 | torch.Size([120]) || stage3.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.350 | 0.322 | 0.046 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.000 | -0.045 | 0.064 | 0.010 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.862 | 0.679 | 0.990 | 0.059 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.weight + | -0.004 | -0.313 | 0.190 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.bias + | 0.001 | -0.370 | 0.364 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc11.weight + | -0.092 | -0.231 | 0.129 | 0.057 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.375 | 0.511 | 0.090 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc12.weight + | 0.002 | -0.114 | 0.114 | 0.040 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -0.389 | 0.354 | 0.088 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.mlp.fc2.weight + | 0.005 | -0.258 | 0.164 | 0.073 | torch.Size([120]) || stage3.residual_group1.blocks.4.mlp.fc2.bias + | 0.899 | 0.480 | 1.089 | 0.103 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.weight + | -0.030 | -0.257 | 0.115 | 0.056 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.bias + | 0.003 | -0.462 | 0.290 | 0.069 | torch.Size([675, 6]) || stage3.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.391 | 0.365 | 0.069 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_self.weight + | -0.004 | -0.232 | 0.302 | 0.064 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.267 | 0.293 | 0.051 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.attn.proj.weight + | 0.000 | -0.250 | 0.182 | 0.070 | torch.Size([120]) || stage3.residual_group1.blocks.5.attn.proj.bias + | -0.000 | -0.238 | 0.257 | 0.033 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.001 | -0.032 | 0.033 | 0.008 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.864 | 0.651 | 1.029 | 0.070 | torch.Size([120]) || 
stage3.residual_group1.blocks.5.norm2.weight + | -0.003 | -0.212 | 0.175 | 0.075 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.bias + | 0.000 | -0.378 | 0.379 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc11.weight + | -0.097 | -0.308 | 0.026 | 0.051 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc11.bias + | 0.000 | -0.578 | 0.401 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc12.weight + | -0.005 | -0.166 | 0.131 | 0.049 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.358 | 0.376 | 0.085 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.mlp.fc2.weight + | 0.001 | -0.262 | 0.176 | 0.072 | torch.Size([120]) || stage3.residual_group1.blocks.5.mlp.fc2.bias + | 0.003 | -0.284 | 0.467 | 0.071 | torch.Size([120, 120]) || stage3.linear1.weight + | 0.006 | -0.201 | 0.269 | 0.090 | torch.Size([120]) || stage3.linear1.bias + | 0.877 | 0.568 | 1.197 | 0.115 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.weight + | 0.002 | -0.248 | 0.324 | 0.100 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.bias + | 0.000 | -0.261 | 0.125 | 0.029 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.563 | 0.552 | 0.074 | torch.Size([360, 120]) || stage3.residual_group2.blocks.0.attn.qkv_self.weight + | 0.005 | -0.257 | 0.302 | 0.081 | torch.Size([360]) || stage3.residual_group2.blocks.0.attn.qkv_self.bias + | 0.000 | -0.390 | 0.385 | 0.084 | torch.Size([120, 120]) || stage3.residual_group2.blocks.0.attn.proj.weight + | 0.002 | -0.450 | 0.235 | 0.125 | torch.Size([120]) || stage3.residual_group2.blocks.0.attn.proj.bias + | 0.986 | 0.755 | 1.165 | 0.078 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.weight + | -0.000 | -0.260 | 0.169 | 0.076 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.355 | 0.397 | 0.087 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc11.weight + | -0.046 | -0.220 | 0.086 | 0.055 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc11.bias + | 0.000 | -0.424 | 0.368 | 0.089 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc12.weight + | -0.006 | -0.111 | 0.122 | 0.038 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.354 | 0.374 | 0.090 | torch.Size([120, 240]) || stage3.residual_group2.blocks.0.mlp.fc2.weight + | 0.001 | -0.374 | 0.272 | 0.101 | torch.Size([120]) || stage3.residual_group2.blocks.0.mlp.fc2.bias + | 0.919 | 0.643 | 1.132 | 0.100 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.weight + | 0.000 | -0.177 | 0.181 | 0.063 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.bias + | 0.000 | -0.332 | 0.131 | 0.028 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.418 | 0.362 | 0.069 | torch.Size([360, 120]) || stage3.residual_group2.blocks.1.attn.qkv_self.weight + | -0.004 | -0.375 | 0.347 | 0.082 | torch.Size([360]) || stage3.residual_group2.blocks.1.attn.qkv_self.bias + | -0.001 | -0.294 | 0.354 | 0.077 | torch.Size([120, 120]) || stage3.residual_group2.blocks.1.attn.proj.weight + | 0.003 | -0.432 | 0.259 | 0.101 | 
torch.Size([120]) || stage3.residual_group2.blocks.1.attn.proj.bias + | 1.012 | 0.750 | 1.178 | 0.077 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.weight + | -0.001 | -0.171 | 0.155 | 0.060 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.331 | 0.356 | 0.087 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc11.weight + | -0.035 | -0.207 | 0.197 | 0.065 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.399 | 0.398 | 0.092 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc12.weight + | -0.002 | -0.111 | 0.129 | 0.041 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc12.bias + | -0.001 | -0.353 | 0.330 | 0.088 | torch.Size([120, 240]) || stage3.residual_group2.blocks.1.mlp.fc2.weight + | -0.001 | -0.328 | 0.127 | 0.064 | torch.Size([120]) || stage3.residual_group2.blocks.1.mlp.fc2.bias + | 0.003 | -0.289 | 0.519 | 0.073 | torch.Size([120, 120]) || stage3.linear2.weight + | 0.002 | -0.318 | 0.371 | 0.144 | torch.Size([120]) || stage3.linear2.bias + | -0.000 | -0.086 | 0.095 | 0.022 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.weight + | -0.002 | -0.023 | 0.021 | 0.010 | torch.Size([120]) || stage3.pa_deform.bias + | -0.000 | -0.060 | 0.056 | 0.015 | torch.Size([120, 242, 3, 3]) || stage3.pa_deform.conv_offset.0.weight + | -0.008 | -0.035 | 0.019 | 0.013 | torch.Size([120]) || stage3.pa_deform.conv_offset.0.bias + | -0.001 | -0.064 | 0.062 | 0.019 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.2.weight + | -0.007 | -0.044 | 0.031 | 0.019 | torch.Size([120]) || stage3.pa_deform.conv_offset.2.bias + | 0.000 | -0.062 | 0.063 | 0.019 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.4.weight + | -0.006 | -0.052 | 0.043 | 0.021 | torch.Size([120]) || stage3.pa_deform.conv_offset.4.bias + | 0.000 | -0.081 | 0.080 | 0.011 | torch.Size([324, 120, 3, 3]) || stage3.pa_deform.conv_offset.6.weight + | -0.004 | -0.087 | 0.083 | 0.021 | torch.Size([324]) || stage3.pa_deform.conv_offset.6.bias + | -0.002 | -0.465 | 0.513 | 0.101 | torch.Size([360, 360]) || stage3.pa_fuse.fc11.weight + | 0.059 | -0.251 | 0.595 | 0.104 | torch.Size([360]) || stage3.pa_fuse.fc11.bias + | -0.000 | -0.544 | 0.531 | 0.100 | torch.Size([360, 360]) || stage3.pa_fuse.fc12.weight + | 0.001 | -0.589 | 0.433 | 0.106 | torch.Size([360]) || stage3.pa_fuse.fc12.bias + | -0.000 | -0.535 | 0.562 | 0.127 | torch.Size([120, 360]) || stage3.pa_fuse.fc2.weight + | -0.001 | -0.401 | 0.342 | 0.121 | torch.Size([120]) || stage3.pa_fuse.fc2.bias + | 0.997 | 0.921 | 1.125 | 0.028 | torch.Size([480]) || stage4.reshape.1.weight + | -0.000 | -0.058 | 0.059 | 0.022 | torch.Size([480]) || stage4.reshape.1.bias + | 0.000 | -0.155 | 0.150 | 0.031 | torch.Size([120, 480]) || stage4.reshape.2.weight + | 0.001 | -0.016 | 0.016 | 0.006 | torch.Size([120]) || stage4.reshape.2.bias + | 1.002 | 0.999 | 1.009 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.weight + | 0.000 | -0.002 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.071 | 0.066 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.093 | 0.081 | 0.020 | torch.Size([360, 120]) 
|| stage4.residual_group1.blocks.0.attn.qkv_self.weight + | -0.000 | -0.009 | 0.009 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.080 | 0.097 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.attn.proj.weight + | 0.000 | -0.035 | 0.027 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.080 | 0.079 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.weight + | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.079 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc11.weight + | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.087 | 0.092 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc12.weight + | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc12.bias + | 0.000 | -0.080 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.mlp.fc2.weight + | 0.000 | -0.031 | 0.029 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.0.mlp.fc2.bias + | 1.002 | 0.997 | 1.007 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.weight + | -0.000 | -0.002 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.bias + | 0.000 | -0.066 | 0.065 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.1.attn.position_bias + | -0.000 | -0.078 | 0.081 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_self.weight + | 0.000 | -0.006 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.080 | 0.083 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.attn.proj.weight + | -0.000 | -0.027 | 0.029 | 0.012 | torch.Size([120]) || stage4.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.077 | 0.082 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.000 | -0.006 | 0.009 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.weight + | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.bias + | -0.000 | -0.080 | 0.078 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc11.weight + | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.077 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc12.weight + | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc12.bias + | 0.000 | -0.084 | 0.075 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.mlp.fc2.weight + | 0.000 | -0.034 | 0.031 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.1.mlp.fc2.bias + | 
1.002 | 0.996 | 1.008 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.weight + | -0.000 | -0.003 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.bias + | 0.001 | -0.070 | 0.071 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.091 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_self.weight + | -0.000 | -0.007 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.080 | 0.084 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.attn.proj.weight + | -0.000 | -0.023 | 0.026 | 0.010 | torch.Size([120]) || stage4.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.107 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_mut.bias + | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.weight + | 0.000 | -0.000 | 0.001 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.076 | 0.077 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc11.weight + | -0.000 | -0.005 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -2.000 | 0.081 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc12.weight + | 0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc12.bias + | -0.000 | -0.084 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.mlp.fc2.weight + | 0.000 | -0.027 | 0.024 | 0.010 | torch.Size([120]) || stage4.residual_group1.blocks.2.mlp.fc2.bias + | 1.002 | 0.999 | 1.012 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.weight + | -0.000 | -0.003 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.bias + | 0.000 | -0.064 | 0.071 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.099 | 0.088 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_self.weight + | 0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.083 | 0.084 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.attn.proj.weight + | -0.000 | -0.019 | 0.018 | 0.008 | torch.Size([120]) || stage4.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.079 | 0.084 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.000 | -0.004 | 0.004 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.weight + | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.bias + | -0.000 | 
+ | -0.000 | -0.078 | 0.081 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc11.bias
+ | -0.000 | -0.087 | 0.076 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc12.bias
+ | -0.000 | -0.079 | 0.082 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.000 | -0.022 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group1.blocks.3.mlp.fc2.bias
+ | 1.002 | 0.998 | 1.011 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.weight
+ | -0.001 | -0.004 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.bias
+ | 0.000 | -0.089 | 0.081 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.080 | 0.085 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_self.bias
+ | -0.000 | -0.075 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.attn.proj.weight
+ | -0.000 | -0.021 | 0.016 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.4.attn.proj.bias
+ | 0.000 | -0.082 | 0.088 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_mut.weight
+ | -0.000 | -0.004 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.weight
+ | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.bias
+ | -0.000 | -0.086 | 0.080 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc11.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc11.bias
+ | 0.000 | -0.084 | 0.083 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc12.bias
+ | 0.000 | -0.076 | 0.081 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.000 | -0.018 | 0.015 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.4.mlp.fc2.bias
+ | 1.003 | 0.997 | 1.014 | 0.003 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.weight
+ | -0.001 | -0.005 | 0.004 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.bias
+ | -0.001 | -0.070 | 0.069 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -0.097 | 0.082 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_self.weight
+ | 0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_self.bias
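Every mlp.fc11/fc12/fc2 triple above is one gated (GEGLU-style) feed-forward: two parallel 120-to-240 projections, where the GELU-activated fc11 branch gates the linear fc12 branch elementwise, followed by a 240-to-120 output projection. That is why fc11 and fc12 weights are [240, 120] while fc2 is [120, 240]. A minimal sketch of that structure, matching the logged shapes (class name ours):

    import torch
    import torch.nn as nn

    class MlpGEGLU(nn.Module):
        # fc11/fc12: weight [240, 120]; fc2: weight [120, 240]
        def __init__(self, dim: int = 120, hidden: int = 240):
            super().__init__()
            self.fc11 = nn.Linear(dim, hidden)  # gate branch (GELU)
            self.fc12 = nn.Linear(dim, hidden)  # value branch (linear)
            self.act = nn.GELU()
            self.fc2 = nn.Linear(hidden, dim)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return self.fc2(self.act(self.fc11(x)) * self.fc12(x))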
+ | -0.000 | -0.075 | 0.089 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.attn.proj.weight
+ | 0.000 | -0.016 | 0.015 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.5.attn.proj.bias
+ | 0.000 | -0.083 | 0.091 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_mut.weight
+ | 0.000 | -0.006 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.bias
+ | 0.000 | -0.093 | 0.083 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc11.weight
+ | 0.000 | -0.002 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc11.bias
+ | 0.000 | -0.086 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc12.bias
+ | 0.000 | -0.079 | 0.092 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.mlp.fc2.weight
+ | -0.000 | -0.012 | 0.016 | 0.005 | torch.Size([120]) || stage4.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.090 | 0.111 | 0.024 | torch.Size([120, 120]) || stage4.linear1.weight
+ | 0.001 | -0.019 | 0.029 | 0.009 | torch.Size([120]) || stage4.linear1.bias
+ | 1.000 | 0.999 | 1.003 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.bias
+ | -0.000 | -0.078 | 0.075 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.0.attn.relative_position_index
+ | 0.000 | -0.084 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.000 | -0.005 | 0.004 | 0.001 | torch.Size([360]) || stage4.residual_group2.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.079 | 0.080 | 0.020 | torch.Size([120, 120]) || stage4.residual_group2.blocks.0.attn.proj.weight
+ | 0.000 | -0.021 | 0.024 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.0.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.bias
+ | -0.000 | -0.079 | 0.072 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.077 | 0.078 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.102 | 0.078 | 0.020 | torch.Size([120, 240]) || stage4.residual_group2.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.024 | 0.020 | 0.009 | torch.Size([120]) || stage4.residual_group2.blocks.0.mlp.fc2.bias
+ | 1.001 | 0.998 | 1.003 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.weight
+ | -0.000 | -0.002 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.bias
+ | -0.000 | -0.071 | 0.079 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.1.attn.relative_position_bias_table
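The two attention variants in each stage differ only in window size, and the logged shapes confirm it: a Swin-style relative-position bias table has (2Wt-1)(2Wh-1)(2Ww-1) rows with one column per head (6 here). For the residual_group1 blocks the window is (2, 8, 8), giving 3*15*15 = 675 rows and 2*8*8 = 128 tokens, hence a [128, 128] relative_position_index spanning 0..674 (the logged min 0.000, max 674.000 and mean 337.000 match exactly). For the residual_group2 blocks the window is (6, 8, 8): 11*15*15 = 2475 rows and 6*8*8 = 384 tokens, spanning 0..2474. A sketch of the standard index construction under those assumptions:

    import torch

    def rel_pos_table_rows_and_index(window):
        # window = (Wt, Wh, Ww); one bias-table row per 3-D relative offset
        wt, wh, ww = window
        coords = torch.stack(torch.meshgrid(
            torch.arange(wt), torch.arange(wh), torch.arange(ww),
            indexing='ij')).flatten(1)                  # (3, N), N = Wt*Wh*Ww
        rel = (coords[:, :, None] - coords[:, None, :]).permute(1, 2, 0)
        rel = rel + torch.tensor([wt - 1, wh - 1, ww - 1])  # shift to >= 0
        rel[..., 0] *= (2 * wh - 1) * (2 * ww - 1)      # flatten 3-D offset
        rel[..., 1] *= (2 * ww - 1)
        rows = (2 * wt - 1) * (2 * wh - 1) * (2 * ww - 1)
        return rows, rel.sum(-1)                        # rows, (N, N) index

    # (2, 8, 8) -> 675 rows, 128x128 index; (6, 8, 8) -> 2475 rows, 384x384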
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.1.attn.relative_position_index
+ | 0.000 | -0.078 | 0.096 | 0.020 | torch.Size([360, 120]) || stage4.residual_group2.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.005 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group2.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.077 | 0.080 | 0.020 | torch.Size([120, 120]) || stage4.residual_group2.blocks.1.attn.proj.weight
+ | 0.000 | -0.020 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.1.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.bias
+ | -0.000 | -0.085 | 0.082 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.083 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.000 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.078 | 0.078 | 0.020 | torch.Size([120, 240]) || stage4.residual_group2.blocks.1.mlp.fc2.weight
+ | 0.000 | -0.022 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.000 | -0.092 | 0.112 | 0.023 | torch.Size([120, 120]) || stage4.linear2.weight
+ | 0.000 | -0.032 | 0.049 | 0.015 | torch.Size([120]) || stage4.linear2.bias
+ | 0.000 | -0.036 | 0.037 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.weight
+ | 0.000 | -0.005 | 0.005 | 0.002 | torch.Size([120]) || stage4.pa_deform.bias
+ | -0.000 | -0.021 | 0.022 | 0.012 | torch.Size([120, 242, 3, 3]) || stage4.pa_deform.conv_offset.0.weight
+ | -0.001 | -0.021 | 0.021 | 0.012 | torch.Size([120]) || stage4.pa_deform.conv_offset.0.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.2.weight
+ | 0.002 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage4.pa_deform.conv_offset.2.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.4.weight
+ | -0.002 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage4.pa_deform.conv_offset.4.bias
+ | 0.000 | -0.003 | 0.002 | 0.000 | torch.Size([324, 120, 3, 3]) || stage4.pa_deform.conv_offset.6.weight
+ | 0.000 | -0.005 | 0.004 | 0.001 | torch.Size([324]) || stage4.pa_deform.conv_offset.6.bias
+ | 0.000 | -0.172 | 0.177 | 0.022 | torch.Size([360, 360]) || stage4.pa_fuse.fc11.weight
+ | 0.002 | -0.027 | 0.088 | 0.014 | torch.Size([360]) || stage4.pa_fuse.fc11.bias
+ | 0.000 | -0.212 | 0.163 | 0.022 | torch.Size([360, 360]) || stage4.pa_fuse.fc12.weight
+ | 0.000 | -0.066 | 0.081 | 0.014 | torch.Size([360]) || stage4.pa_fuse.fc12.bias
+ | 0.000 | -0.413 | 0.387 | 0.029 | torch.Size([120, 360]) || stage4.pa_fuse.fc2.weight
+ | -0.001 | -0.198 | 0.214 | 0.073 | torch.Size([120]) || stage4.pa_fuse.fc2.bias
+ | 0.979 | 0.896 | 1.076 | 0.053 | torch.Size([30]) || stage5.reshape.1.weight
+ | -0.005 | -0.074 | 0.100 | 0.043 | torch.Size([30]) || stage5.reshape.1.bias
+ | 0.000 | -0.240 | 0.249 | 0.058 | torch.Size([120, 30]) || stage5.reshape.2.weight
+ | -0.002 | -0.286 | 0.229 | 0.080 | torch.Size([120]) || stage5.reshape.2.bias
+ | 1.001 | 0.993 | 1.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.weight
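The stage4 alignment block above decomposes the same way in every stage. pa_deform.conv_offset.0 takes 242 = 2*120 + 2 input channels, consistent with concatenating the current feature, one flow-warped neighbour feature and the 2-channel optical flow; conv_offset.6 emits 324 = 3 * 12 * 9 channels for a 3x3 kernel with 12 deformable groups, i.e. two thirds sampling offsets (x and y) and one third modulation masks in the DCNv2 convention. pa_fuse then reduces 360 = 3*120 channels (the original feature plus the two aligned ones) back to 120 through the same gated-MLP shape. The stage5.reshape rows just above are the rescaling seam between stages: 120 channels are rearranged into 30 = 120/4 at a different resolution, normalized, then projected back to 120 (LayerNorm(30) followed by Linear(30, 120)). A sketch of the offset/mask bookkeeping, assuming the DCNv2 split:

    import torch

    groups, k = 12, 3                     # deformable groups, kernel size
    feat, flow_ch = 120, 2
    in_ch = 2 * feat + flow_ch            # 242, as in conv_offset.0
    out_ch = 3 * groups * k * k           # 324, as in conv_offset.6

    raw = torch.randn(1, out_ch, 64, 64)  # conv_offset.6 output on a 64x64 map
    o1, o2, mask = torch.chunk(raw, 3, dim=1)
    offset = torch.cat((o1, o2), dim=1)   # 216 = 2*12*9 sampling offsets
    mask = torch.sigmoid(mask)            # 108 = 12*9 per-location weights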
+ | -0.004 | -0.018 | 0.006 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.bias
+ | -0.000 | -0.066 | 0.062 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.0.attn.position_bias
+ | -0.000 | -0.091 | 0.086 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.000 | -0.014 | 0.012 | 0.004 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.166 | 0.172 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.attn.proj.weight
+ | -0.001 | -0.053 | 0.045 | 0.018 | torch.Size([120]) || stage5.residual_group1.blocks.0.attn.proj.bias
+ | -0.000 | -0.090 | 0.081 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_mut.weight
+ | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.999 | 0.987 | 1.001 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.weight
+ | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.bias
+ | 0.000 | -0.094 | 0.079 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc11.weight
+ | 0.000 | -0.022 | 0.012 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.082 | 0.083 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc12.weight
+ | 0.000 | -0.013 | 0.014 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.075 | 0.083 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.073 | 0.078 | 0.021 | torch.Size([120]) || stage5.residual_group1.blocks.0.mlp.fc2.bias
+ | 1.001 | 0.994 | 1.007 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.weight
+ | -0.004 | -0.016 | 0.004 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.bias
+ | 0.000 | -0.065 | 0.063 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.1.attn.position_bias
+ | -0.000 | -0.077 | 0.083 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.022 | 0.017 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.113 | 0.098 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.attn.proj.weight
+ | 0.000 | -0.058 | 0.045 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.080 | 0.080 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_mut.weight
+ | -0.000 | -0.008 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 0.999 | 0.982 | 1.001 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.weight
+ | 0.000 | -0.006 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.bias
+ | -0.000 | -0.076 | 0.083 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc11.weight
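Each residual_group1 block carries two packed projections: qkv_self ([360, 120], i.e. three 120-d maps for windowed self-attention) and qkv_mut (same shape, used for mutual attention between the two frames of the (2, 8, 8) window). Its attn.proj takes 240 inputs because the mutual- and self-attention outputs (120 each) are concatenated before projection; the residual_group2 blocks earlier, by contrast, have no qkv_mut and a [120, 120] proj, i.e. plain self-attention only. The attn.position_bias buffer ([1, 64, 120], range -1..1, identical statistics in every block) looks like a fixed sinusoidal encoding of the 8x8 = 64 spatial window positions. A single-head sketch of the self + mutual combination (head splitting and bias terms omitted):

    import torch
    import torch.nn as nn

    dim, n = 120, 64                      # 64 tokens per frame, 2 frames/window
    qkv_self = nn.Linear(dim, 3 * dim)    # weight [360, 120]
    qkv_mut = nn.Linear(dim, 3 * dim)     # weight [360, 120]
    proj = nn.Linear(2 * dim, dim)        # weight [120, 240]

    def attend(q, k, v):
        return ((q @ k.transpose(-2, -1)) / dim ** 0.5).softmax(-1) @ v

    x = torch.randn(1, 2 * n, dim)        # one (2, 8, 8) window = 128 tokens
    q, k, v = qkv_self(x).chunk(3, -1)
    x_self = attend(q, k, v)              # joint self-attention, 128 tokens

    qm, km, vm = qkv_mut(x).chunk(3, -1)
    x_mut = torch.cat([attend(qm[:, :n], km[:, n:], vm[:, n:]),   # frame 1 <- 2
                       attend(qm[:, n:], km[:, :n], vm[:, :n])],  # frame 2 <- 1
                      dim=1)
    y = proj(torch.cat([x_mut, x_self], dim=-1))  # 240 -> 120, hence [120, 240]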
+ | 0.000 | -0.017 | 0.014 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.080 | 0.086 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc12.weight
+ | -0.000 | -0.014 | 0.016 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.096 | 0.079 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.001 | -0.051 | 0.039 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.1.mlp.fc2.bias
+ | 1.002 | 0.998 | 1.009 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.weight
+ | -0.004 | -0.014 | 0.003 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.bias
+ | 0.000 | -0.067 | 0.073 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.2.attn.position_bias
+ | 0.000 | -0.085 | 0.087 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_self.weight
+ | 0.000 | -0.015 | 0.014 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_self.bias
+ | -0.000 | -0.108 | 0.095 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.attn.proj.weight
+ | -0.001 | -0.043 | 0.039 | 0.013 | torch.Size([120]) || stage5.residual_group1.blocks.2.attn.proj.bias
+ | -0.000 | -0.088 | 0.081 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_mut.weight
+ | -0.000 | -0.009 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 0.999 | 0.978 | 1.001 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.weight
+ | 0.000 | -0.003 | 0.004 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.bias
+ | -0.000 | -0.076 | 0.081 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.000 | -0.012 | 0.019 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc11.bias
+ | 0.000 | -0.079 | 0.077 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc12.weight
+ | -0.001 | -0.014 | 0.012 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc12.bias
+ | 0.000 | -0.076 | 0.082 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.mlp.fc2.weight
+ | -0.000 | -0.047 | 0.043 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.2.mlp.fc2.bias
+ | 1.002 | 0.978 | 1.015 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.weight
+ | -0.004 | -0.013 | 0.004 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.bias
+ | -0.000 | -0.084 | 0.070 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.078 | 0.082 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_self.weight
+ | -0.000 | -0.014 | 0.014 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_self.bias
+ | -0.000 | -0.123 | 0.132 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.attn.proj.weight
+ | 0.001 | -0.028 | 0.044 | 0.015 | torch.Size([120]) || 
stage5.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.082 | 0.089 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.999 | 0.974 | 1.001 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.weight + | 0.000 | -0.008 | 0.010 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.075 | 0.088 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc11.weight + | 0.000 | -0.014 | 0.019 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.081 | 0.080 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc12.weight + | 0.000 | -0.031 | 0.020 | 0.006 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.081 | 0.106 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.mlp.fc2.weight + | -0.002 | -0.046 | 0.042 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.3.mlp.fc2.bias + | 1.003 | 0.944 | 1.017 | 0.009 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.weight + | -0.005 | -0.015 | 0.004 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.071 | 0.067 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.085 | 0.090 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_self.weight + | -0.000 | -0.021 | 0.013 | 0.004 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.130 | 0.089 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.attn.proj.weight + | -0.001 | -0.036 | 0.024 | 0.011 | torch.Size([120]) || stage5.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.086 | 0.076 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.000 | -0.008 | 0.008 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.999 | 0.967 | 1.001 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.weight + | 0.000 | -0.006 | 0.007 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.bias + | 0.000 | -0.080 | 0.085 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc11.weight + | -0.001 | -0.015 | 0.010 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.081 | 0.077 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc12.weight + | -0.000 | -0.020 | 0.018 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.081 | 0.085 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.mlp.fc2.weight + | -0.001 | -0.037 | 0.050 | 0.014 | torch.Size([120]) || stage5.residual_group1.blocks.4.mlp.fc2.bias + | 1.004 | 0.976 | 1.039 | 0.008 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.weight + | -0.005 | -0.015 | 0.005 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.bias + | -0.000 | -0.070 | 0.076 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.5.attn.relative_position_bias_table + | 
337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.099 | 0.097 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_self.weight + | -0.000 | -0.011 | 0.012 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.084 | 0.093 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.attn.proj.weight + | 0.000 | -0.038 | 0.035 | 0.012 | torch.Size([120]) || stage5.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.087 | 0.082 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.000 | -0.008 | 0.010 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.998 | 0.960 | 1.002 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.weight + | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.088 | 0.095 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc11.weight + | -0.000 | -0.014 | 0.027 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.081 | 0.074 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc12.weight + | 0.000 | -0.013 | 0.025 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -0.100 | 0.086 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.mlp.fc2.weight + | 0.000 | -0.022 | 0.030 | 0.011 | torch.Size([120]) || stage5.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.102 | 0.117 | 0.023 | torch.Size([120, 120]) || stage5.linear1.weight + | -0.003 | -0.297 | 0.242 | 0.084 | torch.Size([120]) || stage5.linear1.bias + | 0.999 | 0.971 | 1.008 | 0.005 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.weight + | -0.000 | -0.035 | 0.034 | 0.011 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.bias + | 0.000 | -0.079 | 0.074 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.087 | 0.083 | 0.020 | torch.Size([360, 120]) || stage5.residual_group2.blocks.0.attn.qkv_self.weight + | -0.000 | -0.028 | 0.018 | 0.005 | torch.Size([360]) || stage5.residual_group2.blocks.0.attn.qkv_self.bias + | 0.000 | -0.079 | 0.082 | 0.021 | torch.Size([120, 120]) || stage5.residual_group2.blocks.0.attn.proj.weight + | -0.001 | -0.146 | 0.171 | 0.054 | torch.Size([120]) || stage5.residual_group2.blocks.0.attn.proj.bias + | 0.997 | 0.967 | 1.003 | 0.006 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.weight + | 0.000 | -0.005 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.bias + | -0.000 | -0.073 | 0.089 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc11.weight + | -0.002 | -0.017 | 0.008 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.084 | 0.073 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc12.weight + | 0.000 | -0.013 | 0.011 | 0.003 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.083 | 0.085 | 0.020 | torch.Size([120, 240]) || 
stage5.residual_group2.blocks.0.mlp.fc2.weight + | 0.000 | -0.103 | 0.140 | 0.037 | torch.Size([120]) || stage5.residual_group2.blocks.0.mlp.fc2.bias + | 0.999 | 0.986 | 1.010 | 0.004 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.weight + | 0.000 | -0.035 | 0.034 | 0.010 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.bias + | 0.000 | -0.087 | 0.074 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.084 | 0.079 | 0.020 | torch.Size([360, 120]) || stage5.residual_group2.blocks.1.attn.qkv_self.weight + | 0.000 | -0.024 | 0.024 | 0.005 | torch.Size([360]) || stage5.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.077 | 0.078 | 0.021 | torch.Size([120, 120]) || stage5.residual_group2.blocks.1.attn.proj.weight + | -0.001 | -0.112 | 0.144 | 0.038 | torch.Size([120]) || stage5.residual_group2.blocks.1.attn.proj.bias + | 0.998 | 0.965 | 1.004 | 0.006 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.weight + | 0.000 | -0.004 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.088 | 0.079 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc11.weight + | -0.001 | -0.012 | 0.015 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.102 | 0.080 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc12.weight + | 0.000 | -0.012 | 0.009 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc12.bias + | 0.000 | -0.075 | 0.078 | 0.020 | torch.Size([120, 240]) || stage5.residual_group2.blocks.1.mlp.fc2.weight + | 0.000 | -0.105 | 0.131 | 0.042 | torch.Size([120]) || stage5.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.220 | 0.209 | 0.035 | torch.Size([120, 120]) || stage5.linear2.weight + | -0.003 | -0.335 | 0.284 | 0.096 | torch.Size([120]) || stage5.linear2.bias + | -0.000 | -0.064 | 0.065 | 0.019 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.weight + | 0.001 | -0.050 | 0.050 | 0.029 | torch.Size([120]) || stage5.pa_deform.bias + | 0.000 | -0.119 | 0.106 | 0.013 | torch.Size([120, 242, 3, 3]) || stage5.pa_deform.conv_offset.0.weight + | -0.006 | -0.030 | 0.026 | 0.014 | torch.Size([120]) || stage5.pa_deform.conv_offset.0.bias + | -0.001 | -0.055 | 0.050 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.2.weight + | 0.001 | -0.033 | 0.031 | 0.018 | torch.Size([120]) || stage5.pa_deform.conv_offset.2.bias + | 0.001 | -0.060 | 0.050 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.4.weight + | -0.005 | -0.040 | 0.037 | 0.019 | torch.Size([120]) || stage5.pa_deform.conv_offset.4.bias + | 0.001 | -0.038 | 0.051 | 0.006 | torch.Size([324, 120, 3, 3]) || stage5.pa_deform.conv_offset.6.weight + | 0.000 | -0.048 | 0.050 | 0.017 | torch.Size([324]) || stage5.pa_deform.conv_offset.6.bias + | 0.000 | -0.334 | 0.340 | 0.036 | torch.Size([360, 360]) || stage5.pa_fuse.fc11.weight + | 0.037 | -0.050 | 0.294 | 0.064 | torch.Size([360]) || stage5.pa_fuse.fc11.bias + | -0.000 | -0.343 | 0.349 | 0.036 | torch.Size([360, 360]) || stage5.pa_fuse.fc12.weight + | -0.001 | -0.237 | 0.244 | 0.049 | torch.Size([360]) || stage5.pa_fuse.fc12.bias + | -0.000 | -0.575 | 0.591 | 0.060 | torch.Size([120, 360]) || stage5.pa_fuse.fc2.weight + | -0.001 | -0.404 | 0.344 | 0.122 | torch.Size([120]) || 
stage5.pa_fuse.fc2.bias + | 1.254 | 1.058 | 1.466 | 0.126 | torch.Size([30]) || stage6.reshape.1.weight + | -0.001 | -0.074 | 0.093 | 0.041 | torch.Size([30]) || stage6.reshape.1.bias + | 0.000 | -0.734 | 0.625 | 0.177 | torch.Size([120, 30]) || stage6.reshape.2.weight + | 0.003 | -0.269 | 0.341 | 0.108 | torch.Size([120]) || stage6.reshape.2.bias + | 0.815 | 0.495 | 1.118 | 0.121 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.weight + | -0.071 | -0.291 | 0.263 | 0.101 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.080 | 0.087 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.136 | 0.134 | 0.026 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_self.weight + | -0.000 | -0.061 | 0.037 | 0.014 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_self.bias + | -0.000 | -0.201 | 0.182 | 0.032 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.attn.proj.weight + | 0.000 | -0.223 | 0.189 | 0.090 | torch.Size([120]) || stage6.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.184 | 0.211 | 0.029 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.000 | -0.049 | 0.069 | 0.011 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.710 | 0.556 | 0.893 | 0.072 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.weight + | -0.003 | -0.172 | 0.193 | 0.070 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.217 | 0.211 | 0.033 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc11.weight + | -0.041 | -0.158 | 0.025 | 0.036 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.209 | 0.178 | 0.031 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc12.weight + | -0.000 | -0.141 | 0.186 | 0.031 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc12.bias + | 0.000 | -0.245 | 0.347 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.mlp.fc2.weight + | 0.005 | -0.161 | 0.188 | 0.079 | torch.Size([120]) || stage6.residual_group1.blocks.0.mlp.fc2.bias + | 0.780 | 0.582 | 0.963 | 0.088 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.weight + | -0.112 | -0.302 | 0.103 | 0.085 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.bias + | 0.000 | -0.101 | 0.072 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.112 | 0.178 | 0.026 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_self.weight + | -0.000 | -0.034 | 0.049 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.223 | 0.242 | 0.033 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.attn.proj.weight + | -0.003 | -0.149 | 0.105 | 0.047 | torch.Size([120]) || stage6.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.199 | 0.173 | 0.031 | torch.Size([360, 120]) || 
stage6.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.000 | -0.035 | 0.056 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.744 | 0.530 | 0.917 | 0.066 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.weight + | 0.004 | -0.131 | 0.180 | 0.059 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.243 | 0.294 | 0.036 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc11.weight + | -0.039 | -0.217 | 0.045 | 0.037 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.206 | 0.178 | 0.033 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc12.weight + | -0.000 | -0.129 | 0.125 | 0.028 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.236 | 0.276 | 0.040 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.mlp.fc2.weight + | 0.000 | -0.158 | 0.170 | 0.063 | torch.Size([120]) || stage6.residual_group1.blocks.1.mlp.fc2.bias + | 0.829 | 0.586 | 1.007 | 0.078 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.weight + | -0.101 | -0.353 | 0.132 | 0.092 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.bias + | -0.000 | -0.082 | 0.076 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.154 | 0.143 | 0.032 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_self.weight + | 0.000 | -0.041 | 0.038 | 0.012 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.187 | 0.202 | 0.035 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.096 | 0.127 | 0.041 | torch.Size([120]) || stage6.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.203 | 0.185 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.000 | -0.045 | 0.049 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.768 | 0.491 | 0.904 | 0.069 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.weight + | 0.001 | -0.146 | 0.159 | 0.062 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.184 | 0.204 | 0.037 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc11.weight + | -0.043 | -0.185 | 0.020 | 0.035 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.188 | 0.270 | 0.035 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc12.weight + | 0.000 | -0.152 | 0.134 | 0.031 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc12.bias + | -0.000 | -0.222 | 0.217 | 0.042 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.mlp.fc2.weight + | 0.002 | -0.141 | 0.144 | 0.058 | torch.Size([120]) || stage6.residual_group1.blocks.2.mlp.fc2.bias + | 0.820 | 0.554 | 0.976 | 0.065 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.weight + | -0.091 | -0.336 | 0.137 | 0.087 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.bias + | 0.000 | -0.124 | 0.222 | 0.023 | torch.Size([675, 6]) || stage6.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || 
stage6.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.157 | 0.175 | 0.036 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_self.weight + | -0.001 | -0.049 | 0.049 | 0.014 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.238 | 0.236 | 0.036 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.attn.proj.weight + | -0.003 | -0.077 | 0.074 | 0.031 | torch.Size([120]) || stage6.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.212 | 0.265 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.000 | -0.028 | 0.052 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.768 | 0.530 | 0.903 | 0.080 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.weight + | 0.002 | -0.104 | 0.157 | 0.044 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.bias + | -0.000 | -0.197 | 0.220 | 0.039 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc11.weight + | -0.042 | -0.155 | 0.043 | 0.039 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.166 | 0.199 | 0.036 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc12.weight + | 0.001 | -0.102 | 0.138 | 0.040 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.241 | 0.256 | 0.044 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.mlp.fc2.weight + | 0.003 | -0.123 | 0.115 | 0.046 | torch.Size([120]) || stage6.residual_group1.blocks.3.mlp.fc2.bias + | 0.817 | 0.631 | 0.918 | 0.055 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.weight + | -0.082 | -0.295 | 0.141 | 0.074 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.084 | 0.205 | 0.024 | torch.Size([675, 6]) || stage6.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.174 | 0.199 | 0.040 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_self.weight + | -0.000 | -0.060 | 0.081 | 0.017 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_self.bias + | -0.000 | -0.194 | 0.191 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.attn.proj.weight + | 0.001 | -0.083 | 0.077 | 0.035 | torch.Size([120]) || stage6.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.218 | 0.243 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.000 | -0.031 | 0.024 | 0.007 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.744 | 0.478 | 0.913 | 0.082 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.weight + | -0.003 | -0.146 | 0.110 | 0.053 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.223 | 0.238 | 0.042 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc11.weight + | -0.046 | -0.200 | 0.071 | 0.051 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.168 | 0.201 | 0.039 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc12.weight + | 0.002 | -0.128 | 0.141 | 0.053 | torch.Size([240]) || 
stage6.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -0.220 | 0.205 | 0.047 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.mlp.fc2.weight + | 0.001 | -0.086 | 0.094 | 0.034 | torch.Size([120]) || stage6.residual_group1.blocks.4.mlp.fc2.bias + | 0.754 | 0.353 | 0.933 | 0.056 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.weight + | -0.058 | -0.246 | 0.105 | 0.060 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.bias + | -0.000 | -0.113 | 0.536 | 0.030 | torch.Size([675, 6]) || stage6.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.261 | 0.224 | 0.044 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_self.weight + | 0.002 | -0.050 | 0.067 | 0.018 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_self.bias + | 0.000 | -0.234 | 0.256 | 0.038 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.attn.proj.weight + | 0.002 | -0.079 | 0.076 | 0.036 | torch.Size([120]) || stage6.residual_group1.blocks.5.attn.proj.bias + | -0.000 | -0.211 | 0.231 | 0.029 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.000 | -0.033 | 0.030 | 0.008 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.677 | 0.275 | 0.833 | 0.083 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.weight + | 0.001 | -0.224 | 0.306 | 0.102 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.196 | 0.211 | 0.045 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc11.weight + | -0.061 | -0.289 | 0.136 | 0.089 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.271 | 0.312 | 0.048 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc12.weight + | 0.003 | -0.166 | 0.155 | 0.075 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.286 | 0.375 | 0.054 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.mlp.fc2.weight + | 0.005 | -0.054 | 0.137 | 0.031 | torch.Size([120]) || stage6.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.174 | 0.172 | 0.039 | torch.Size([120, 120]) || stage6.linear1.weight + | 0.002 | -0.275 | 0.348 | 0.113 | torch.Size([120]) || stage6.linear1.bias + | 0.704 | 0.402 | 1.002 | 0.132 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.weight + | 0.001 | -0.466 | 0.407 | 0.157 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.bias + | -0.000 | -0.172 | 0.570 | 0.025 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.337 | 0.378 | 0.041 | torch.Size([360, 120]) || stage6.residual_group2.blocks.0.attn.qkv_self.weight + | -0.000 | -0.071 | 0.068 | 0.019 | torch.Size([360]) || stage6.residual_group2.blocks.0.attn.qkv_self.bias + | 0.001 | -0.290 | 0.321 | 0.055 | torch.Size([120, 120]) || stage6.residual_group2.blocks.0.attn.proj.weight + | 0.001 | -0.255 | 0.250 | 0.104 | torch.Size([120]) || stage6.residual_group2.blocks.0.attn.proj.bias + | 0.695 | 0.353 | 0.966 | 0.098 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.weight + | -0.001 | -0.218 | 0.165 | 
0.080 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.259 | 0.255 | 0.039 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc11.weight + | -0.044 | -0.256 | 0.042 | 0.047 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.234 | 0.214 | 0.035 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc12.weight + | 0.002 | -0.133 | 0.091 | 0.027 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.333 | 0.296 | 0.042 | torch.Size([120, 240]) || stage6.residual_group2.blocks.0.mlp.fc2.weight + | 0.003 | -0.238 | 0.280 | 0.092 | torch.Size([120]) || stage6.residual_group2.blocks.0.mlp.fc2.bias + | 0.671 | 0.425 | 0.980 | 0.094 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.weight + | 0.001 | -0.261 | 0.305 | 0.119 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.372 | 0.942 | 0.031 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.450 | 0.494 | 0.045 | torch.Size([360, 120]) || stage6.residual_group2.blocks.1.attn.qkv_self.weight + | 0.000 | -0.133 | 0.119 | 0.029 | torch.Size([360]) || stage6.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.239 | 0.288 | 0.046 | torch.Size([120, 120]) || stage6.residual_group2.blocks.1.attn.proj.weight + | -0.001 | -0.187 | 0.157 | 0.064 | torch.Size([120]) || stage6.residual_group2.blocks.1.attn.proj.bias + | 0.687 | 0.160 | 0.907 | 0.128 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.weight + | -0.002 | -0.192 | 0.222 | 0.084 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.257 | 0.426 | 0.042 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc11.weight + | -0.064 | -0.207 | 0.036 | 0.048 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.269 | 0.224 | 0.038 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc12.weight + | -0.000 | -0.126 | 0.129 | 0.030 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.308 | 0.298 | 0.041 | torch.Size([120, 240]) || stage6.residual_group2.blocks.1.mlp.fc2.weight + | -0.004 | -0.180 | 0.192 | 0.061 | torch.Size([120]) || stage6.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.297 | 0.368 | 0.069 | torch.Size([120, 120]) || stage6.linear2.weight + | 0.001 | -0.431 | 0.480 | 0.189 | torch.Size([120]) || stage6.linear2.bias + | 0.000 | -0.100 | 0.104 | 0.023 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.weight + | 0.001 | -0.018 | 0.029 | 0.010 | torch.Size([120]) || stage6.pa_deform.bias + | 0.000 | -0.105 | 0.111 | 0.015 | torch.Size([120, 242, 3, 3]) || stage6.pa_deform.conv_offset.0.weight + | -0.007 | -0.033 | 0.024 | 0.014 | torch.Size([120]) || stage6.pa_deform.conv_offset.0.bias + | -0.001 | -0.071 | 0.067 | 0.019 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.2.weight + | -0.003 | -0.061 | 0.043 | 0.022 | torch.Size([120]) || stage6.pa_deform.conv_offset.2.bias + | -0.000 | -0.074 | 0.068 | 0.019 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.4.weight + | 0.001 | -0.075 | 0.056 | 0.030 | torch.Size([120]) || stage6.pa_deform.conv_offset.4.bias + | 0.001 | -0.124 | 0.108 | 0.013 | torch.Size([324, 120, 3, 3]) || stage6.pa_deform.conv_offset.6.weight + | -0.001 | -0.113 | 0.076 | 
0.021 | torch.Size([324]) || stage6.pa_deform.conv_offset.6.bias + | -0.001 | -0.517 | 0.524 | 0.101 | torch.Size([360, 360]) || stage6.pa_fuse.fc11.weight + | 0.154 | -0.305 | 0.679 | 0.180 | torch.Size([360]) || stage6.pa_fuse.fc11.bias + | 0.000 | -0.680 | 0.728 | 0.103 | torch.Size([360, 360]) || stage6.pa_fuse.fc12.weight + | 0.020 | -0.514 | 0.417 | 0.199 | torch.Size([360]) || stage6.pa_fuse.fc12.bias + | -0.000 | -0.587 | 0.737 | 0.135 | torch.Size([120, 360]) || stage6.pa_fuse.fc2.weight + | 0.015 | -0.437 | 0.490 | 0.230 | torch.Size([120]) || stage6.pa_fuse.fc2.bias + | 1.284 | 1.119 | 1.404 | 0.055 | torch.Size([30]) || stage7.reshape.1.weight + | -0.014 | -0.286 | 0.184 | 0.122 | torch.Size([30]) || stage7.reshape.1.bias + | -0.000 | -0.521 | 0.576 | 0.154 | torch.Size([120, 30]) || stage7.reshape.2.weight + | 0.004 | -0.387 | 0.738 | 0.175 | torch.Size([120]) || stage7.reshape.2.bias + | 0.440 | 0.099 | 0.775 | 0.141 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.weight + | -0.177 | -0.670 | 0.319 | 0.183 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.bias + | -0.055 | -2.159 | 1.979 | 0.240 | torch.Size([675, 6]) || stage7.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.535 | 0.554 | 0.104 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_self.weight + | 0.003 | -0.193 | 0.281 | 0.053 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_self.bias + | -0.001 | -0.397 | 0.395 | 0.075 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.attn.proj.weight + | -0.001 | -0.232 | 0.692 | 0.106 | torch.Size([120]) || stage7.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.899 | 1.073 | 0.091 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.122 | 0.104 | 0.017 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.310 | 0.157 | 0.440 | 0.055 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.weight + | 0.006 | -0.474 | 0.266 | 0.105 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.605 | 0.490 | 0.115 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc11.weight + | -0.101 | -0.310 | 0.126 | 0.070 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.448 | 0.475 | 0.116 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc12.weight + | 0.006 | -0.185 | 0.215 | 0.071 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc12.bias + | 0.001 | -0.465 | 0.512 | 0.122 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.mlp.fc2.weight + | 0.000 | -0.150 | 0.417 | 0.077 | torch.Size([120]) || stage7.residual_group1.blocks.0.mlp.fc2.bias + | 0.577 | 0.165 | 0.829 | 0.105 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.weight + | -0.136 | -0.849 | 0.206 | 0.141 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.bias + | -0.143 | -3.020 | 4.621 | 0.357 | torch.Size([675, 6]) || stage7.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || 
stage7.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.647 | 0.640 | 0.123 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_self.weight + | -0.002 | -0.356 | 0.382 | 0.064 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.457 | 0.378 | 0.081 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.attn.proj.weight + | 0.000 | -0.250 | 0.707 | 0.108 | torch.Size([120]) || stage7.residual_group1.blocks.1.attn.proj.bias + | -0.001 | -1.055 | 1.091 | 0.096 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.001 | -0.093 | 0.123 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.411 | 0.265 | 0.535 | 0.044 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.weight + | 0.008 | -0.630 | 0.264 | 0.121 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.501 | 0.506 | 0.119 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc11.weight + | -0.087 | -0.341 | 0.140 | 0.073 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.450 | 0.527 | 0.119 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc12.weight + | 0.005 | -0.188 | 0.171 | 0.063 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.554 | 0.546 | 0.121 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.mlp.fc2.weight + | -0.000 | -0.135 | 0.220 | 0.061 | torch.Size([120]) || stage7.residual_group1.blocks.1.mlp.fc2.bias + | 0.655 | 0.134 | 0.896 | 0.130 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.weight + | -0.139 | -0.788 | 0.181 | 0.115 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.bias + | -0.062 | -3.469 | 3.276 | 0.272 | torch.Size([675, 6]) || stage7.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.2.attn.position_bias + | -0.000 | -0.592 | 0.650 | 0.124 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_self.weight + | -0.000 | -0.308 | 0.218 | 0.062 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.355 | 0.345 | 0.082 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.213 | 0.700 | 0.097 | torch.Size([120]) || stage7.residual_group1.blocks.2.attn.proj.bias + | -0.001 | -1.166 | 0.942 | 0.107 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.000 | -0.106 | 0.093 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.466 | 0.317 | 0.565 | 0.042 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.weight + | 0.014 | -0.657 | 0.280 | 0.118 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.541 | 0.494 | 0.118 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc11.weight + | -0.079 | -0.335 | 0.122 | 0.080 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.513 | 0.493 | 0.123 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc12.weight + | -0.007 | -0.180 | 0.175 | 0.066 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc12.bias + | -0.001 | -0.509 | 0.479 | 0.123 | torch.Size([120, 240]) || 
stage7.residual_group1.blocks.2.mlp.fc2.weight + | 0.004 | -0.093 | 0.293 | 0.054 | torch.Size([120]) || stage7.residual_group1.blocks.2.mlp.fc2.bias + | 0.693 | 0.147 | 0.945 | 0.133 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.weight + | -0.132 | -0.906 | 0.249 | 0.113 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.bias + | -0.108 | -3.576 | 4.241 | 0.344 | torch.Size([675, 6]) || stage7.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.945 | 1.095 | 0.129 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_self.weight + | 0.003 | -0.274 | 0.204 | 0.061 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_self.bias + | -0.001 | -0.379 | 0.351 | 0.081 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.attn.proj.weight + | 0.000 | -0.211 | 0.587 | 0.095 | torch.Size([120]) || stage7.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -1.269 | 1.067 | 0.102 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.001 | -0.091 | 0.117 | 0.021 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.499 | 0.285 | 0.570 | 0.040 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.weight + | 0.012 | -0.567 | 0.273 | 0.104 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.bias + | 0.001 | -0.528 | 0.499 | 0.118 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc11.weight + | -0.084 | -0.349 | 0.141 | 0.078 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.547 | 0.592 | 0.126 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc12.weight + | 0.002 | -0.154 | 0.176 | 0.068 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc12.bias + | 0.001 | -0.520 | 0.480 | 0.125 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.mlp.fc2.weight + | 0.001 | -0.150 | 0.207 | 0.065 | torch.Size([120]) || stage7.residual_group1.blocks.3.mlp.fc2.bias + | 0.726 | 0.137 | 1.004 | 0.160 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.weight + | -0.122 | -0.907 | 0.180 | 0.103 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.bias + | -0.078 | -3.824 | 4.241 | 0.297 | torch.Size([675, 6]) || stage7.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.4.attn.position_bias + | -0.000 | -1.188 | 0.796 | 0.127 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_self.weight + | 0.002 | -0.248 | 0.207 | 0.056 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_self.bias + | -0.001 | -0.409 | 0.369 | 0.085 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.attn.proj.weight + | 0.002 | -0.224 | 0.322 | 0.094 | torch.Size([120]) || stage7.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -1.744 | 1.273 | 0.110 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.001 | -0.092 | 0.113 | 0.019 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.514 | 0.277 | 0.614 | 0.041 | torch.Size([120]) || 
stage7.residual_group1.blocks.4.norm2.weight + | 0.016 | -0.621 | 0.286 | 0.095 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.bias + | 0.001 | -0.517 | 0.453 | 0.116 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc11.weight + | -0.064 | -0.260 | 0.143 | 0.083 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.503 | 0.554 | 0.129 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc12.weight + | -0.004 | -0.232 | 0.193 | 0.075 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc12.bias + | -0.001 | -0.595 | 0.543 | 0.128 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.mlp.fc2.weight + | 0.001 | -0.196 | 0.198 | 0.071 | torch.Size([120]) || stage7.residual_group1.blocks.4.mlp.fc2.bias + | 0.731 | 0.152 | 1.075 | 0.114 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.weight + | -0.076 | -1.003 | 0.176 | 0.107 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.bias + | -0.121 | -3.281 | 4.671 | 0.296 | torch.Size([675, 6]) || stage7.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.640 | 1.083 | 0.122 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_self.weight + | -0.001 | -0.239 | 0.314 | 0.068 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_self.bias + | 0.001 | -0.344 | 0.452 | 0.078 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.attn.proj.weight + | 0.004 | -0.361 | 0.251 | 0.093 | torch.Size([120]) || stage7.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.637 | 0.806 | 0.093 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.000 | -0.088 | 0.091 | 0.017 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.514 | 0.238 | 0.594 | 0.042 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.weight + | 0.017 | -0.650 | 0.162 | 0.089 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.bias + | 0.000 | -0.442 | 0.479 | 0.114 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc11.weight + | -0.040 | -0.400 | 0.203 | 0.101 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.541 | 0.514 | 0.130 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc12.weight + | -0.008 | -0.319 | 0.309 | 0.092 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -1.018 | 1.398 | 0.130 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.mlp.fc2.weight + | 0.001 | -1.606 | 0.269 | 0.179 | torch.Size([120]) || stage7.residual_group1.blocks.5.mlp.fc2.bias + | 0.000 | -0.186 | 0.207 | 0.048 | torch.Size([120, 120]) || stage7.linear1.weight + | 0.010 | -0.448 | 0.437 | 0.161 | torch.Size([120]) || stage7.linear1.bias + | 0.703 | 0.381 | 0.856 | 0.084 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.weight + | 0.014 | -0.645 | 0.486 | 0.169 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.bias + | -0.007 | -4.468 | 1.008 | 0.164 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage7.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.625 | 0.834 | 0.120 | 
torch.Size([360, 120]) || stage7.residual_group2.blocks.0.attn.qkv_self.weight + | -0.009 | -0.737 | 0.632 | 0.135 | torch.Size([360]) || stage7.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.403 | 0.406 | 0.088 | torch.Size([120, 120]) || stage7.residual_group2.blocks.0.attn.proj.weight + | -0.007 | -0.338 | 0.165 | 0.070 | torch.Size([120]) || stage7.residual_group2.blocks.0.attn.proj.bias + | 0.435 | 0.323 | 0.526 | 0.038 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.weight + | 0.005 | -0.678 | 0.379 | 0.117 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.465 | 0.467 | 0.110 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc11.weight + | -0.031 | -0.236 | 0.180 | 0.077 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.490 | 0.520 | 0.121 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc12.weight + | -0.003 | -0.197 | 0.242 | 0.069 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc12.bias + | -0.000 | -0.525 | 0.501 | 0.122 | torch.Size([120, 240]) || stage7.residual_group2.blocks.0.mlp.fc2.weight + | -0.005 | -0.431 | 0.164 | 0.077 | torch.Size([120]) || stage7.residual_group2.blocks.0.mlp.fc2.bias + | 0.703 | 0.306 | 0.866 | 0.079 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.weight + | 0.009 | -0.647 | 0.481 | 0.149 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.bias + | -0.010 | -3.504 | 1.842 | 0.134 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage7.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.639 | 0.590 | 0.122 | torch.Size([360, 120]) || stage7.residual_group2.blocks.1.attn.qkv_self.weight + | -0.001 | -0.613 | 0.609 | 0.148 | torch.Size([360]) || stage7.residual_group2.blocks.1.attn.qkv_self.bias + | 0.001 | -0.316 | 0.325 | 0.085 | torch.Size([120, 120]) || stage7.residual_group2.blocks.1.attn.proj.weight + | -0.004 | -0.350 | 0.145 | 0.069 | torch.Size([120]) || stage7.residual_group2.blocks.1.attn.proj.bias + | 0.452 | 0.309 | 0.558 | 0.037 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.weight + | 0.003 | -0.661 | 0.246 | 0.091 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.580 | 0.410 | 0.108 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc11.weight + | -0.020 | -0.258 | 0.299 | 0.104 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.561 | 0.126 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc12.weight + | -0.002 | -0.234 | 0.434 | 0.090 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.778 | 0.581 | 0.124 | torch.Size([120, 240]) || stage7.residual_group2.blocks.1.mlp.fc2.weight + | -0.001 | -0.888 | 0.286 | 0.135 | torch.Size([120]) || stage7.residual_group2.blocks.1.mlp.fc2.bias + | -0.001 | -0.348 | 0.237 | 0.060 | torch.Size([120, 120]) || stage7.linear2.weight + | 0.023 | -0.390 | 0.506 | 0.167 | torch.Size([120]) || stage7.linear2.bias + | -0.000 | -0.104 | 0.107 | 0.024 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.weight + | 0.002 | -0.041 | 0.035 | 0.016 | torch.Size([120]) || stage7.pa_deform.bias + | -0.000 | -0.123 | 0.109 | 0.017 | torch.Size([120, 242, 3, 3]) || stage7.pa_deform.conv_offset.0.weight + | -0.002 | -0.034 | 0.032 | 0.015 | torch.Size([120]) || 
stage7.pa_deform.conv_offset.0.bias + | -0.001 | -0.111 | 0.084 | 0.019 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.2.weight + | -0.008 | -0.073 | 0.081 | 0.034 | torch.Size([120]) || stage7.pa_deform.conv_offset.2.bias + | -0.002 | -0.154 | 0.122 | 0.018 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.4.weight + | 0.014 | -0.041 | 0.068 | 0.026 | torch.Size([120]) || stage7.pa_deform.conv_offset.4.bias + | -0.001 | -0.408 | 0.365 | 0.034 | torch.Size([324, 120, 3, 3]) || stage7.pa_deform.conv_offset.6.weight + | -0.003 | -0.057 | 0.054 | 0.024 | torch.Size([324]) || stage7.pa_deform.conv_offset.6.bias + | 0.000 | -0.697 | 0.606 | 0.123 | torch.Size([360, 360]) || stage7.pa_fuse.fc11.weight + | 0.119 | -0.211 | 0.720 | 0.177 | torch.Size([360]) || stage7.pa_fuse.fc11.bias + | 0.000 | -1.175 | 0.924 | 0.154 | torch.Size([360, 360]) || stage7.pa_fuse.fc12.weight + | -0.000 | -0.581 | 0.580 | 0.190 | torch.Size([360]) || stage7.pa_fuse.fc12.bias + | 0.001 | -0.786 | 0.874 | 0.135 | torch.Size([120, 360]) || stage7.pa_fuse.fc2.weight + | -0.053 | -0.522 | 0.577 | 0.205 | torch.Size([120]) || stage7.pa_fuse.fc2.bias + | 1.225 | 1.000 | 1.516 | 0.095 | torch.Size([120]) || stage8.0.1.weight + | -0.013 | -0.413 | 0.465 | 0.139 | torch.Size([120]) || stage8.0.1.bias + | 0.000 | -2.505 | 0.627 | 0.136 | torch.Size([180, 120]) || stage8.0.2.weight + | 0.005 | -0.397 | 0.377 | 0.107 | torch.Size([180]) || stage8.0.2.bias + | 0.456 | 0.123 | 0.760 | 0.129 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.weight + | -0.022 | -0.343 | 0.875 | 0.099 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.bias + | -0.014 | -1.907 | 2.592 | 0.130 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.632 | 0.628 | 0.099 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.0.attn.qkv_self.weight + | 0.006 | -0.567 | 0.668 | 0.148 | torch.Size([540]) || stage8.1.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.477 | 0.447 | 0.094 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.0.attn.proj.weight + | -0.010 | -0.460 | 0.225 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.0.attn.proj.bias + | 0.429 | 0.119 | 0.634 | 0.090 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.weight + | -0.007 | -0.338 | 0.803 | 0.086 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.bias + | -0.006 | -0.572 | 0.539 | 0.119 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc11.weight + | -0.060 | -0.260 | 0.185 | 0.060 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.461 | 0.548 | 0.113 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc12.weight + | 0.000 | -0.163 | 0.183 | 0.050 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.757 | 0.581 | 0.118 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.0.mlp.fc2.weight + | -0.003 | -0.191 | 0.121 | 0.057 | torch.Size([180]) || stage8.1.residual_group.blocks.0.mlp.fc2.bias + | 0.557 | 0.086 | 0.800 | 0.112 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.weight + | -0.029 | -0.230 | 0.878 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.bias + | -0.016 | -2.004 | 1.711 | 0.154 | torch.Size([2475, 6]) || 
stage8.1.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.690 | 0.575 | 0.109 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.1.attn.qkv_self.weight + | 0.011 | -0.641 | 0.609 | 0.135 | torch.Size([540]) || stage8.1.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.466 | 0.401 | 0.094 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.1.attn.proj.weight + | -0.008 | -0.344 | 0.181 | 0.080 | torch.Size([180]) || stage8.1.residual_group.blocks.1.attn.proj.bias + | 0.503 | 0.226 | 0.742 | 0.093 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.weight + | -0.009 | -0.404 | 0.818 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.bias + | -0.007 | -0.595 | 0.532 | 0.121 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc11.weight + | -0.068 | -0.261 | 0.071 | 0.053 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.573 | 0.116 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc12.weight + | 0.002 | -0.129 | 0.197 | 0.046 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.556 | 0.582 | 0.118 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.1.mlp.fc2.weight + | -0.003 | -0.170 | 0.145 | 0.052 | torch.Size([180]) || stage8.1.residual_group.blocks.1.mlp.fc2.bias + | 0.699 | 0.202 | 0.912 | 0.109 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.weight + | -0.033 | -0.253 | 0.924 | 0.091 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.bias + | -0.030 | -2.510 | 2.088 | 0.194 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.637 | 0.801 | 0.116 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.2.attn.qkv_self.weight + | 0.006 | -0.512 | 0.520 | 0.110 | torch.Size([540]) || stage8.1.residual_group.blocks.2.attn.qkv_self.bias + | 0.000 | -0.381 | 0.337 | 0.090 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.2.attn.proj.weight + | -0.011 | -0.238 | 0.234 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.2.attn.proj.bias + | 0.594 | 0.150 | 0.810 | 0.108 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.weight + | -0.010 | -0.483 | 0.726 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.bias + | -0.006 | -0.567 | 0.499 | 0.125 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc11.weight + | -0.077 | -0.360 | 0.050 | 0.056 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.536 | 0.673 | 0.119 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc12.weight + | 0.001 | -0.142 | 0.186 | 0.043 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.536 | 0.524 | 0.119 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.2.mlp.fc2.weight + | -0.006 | -0.147 | 0.133 | 0.051 | torch.Size([180]) || stage8.1.residual_group.blocks.2.mlp.fc2.bias + | 0.683 | 0.141 | 0.908 | 0.105 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.weight + | -0.033 | -0.199 | 0.878 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.bias + | -0.039 | -1.527 | 3.891 | 0.199 | torch.Size([2475, 6]) || 
stage8.1.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.682 | 0.693 | 0.120 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.3.attn.qkv_self.weight + | 0.007 | -0.543 | 0.513 | 0.138 | torch.Size([540]) || stage8.1.residual_group.blocks.3.attn.qkv_self.bias + | -0.001 | -0.390 | 0.476 | 0.089 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.3.attn.proj.weight + | -0.007 | -0.176 | 0.150 | 0.062 | torch.Size([180]) || stage8.1.residual_group.blocks.3.attn.proj.bias + | 0.640 | 0.094 | 0.853 | 0.120 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.weight + | -0.009 | -0.372 | 0.683 | 0.084 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.bias + | -0.006 | -0.628 | 0.521 | 0.126 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc11.weight + | -0.089 | -0.367 | 0.047 | 0.054 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.629 | 0.562 | 0.121 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc12.weight + | -0.001 | -0.186 | 0.128 | 0.042 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.485 | 0.499 | 0.118 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.3.mlp.fc2.weight + | -0.007 | -0.138 | 0.209 | 0.050 | torch.Size([180]) || stage8.1.residual_group.blocks.3.mlp.fc2.bias + | 0.000 | -0.294 | 0.577 | 0.071 | torch.Size([180, 180]) || stage8.1.linear.weight + | 0.004 | -0.349 | 0.235 | 0.072 | torch.Size([180]) || stage8.1.linear.bias + | 0.708 | 0.242 | 1.026 | 0.136 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.weight + | -0.032 | -0.212 | 0.830 | 0.100 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.bias + | -0.039 | -1.954 | 2.394 | 0.212 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.922 | 0.646 | 0.116 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.429 | 0.524 | 0.101 | torch.Size([540]) || stage8.2.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.467 | 0.453 | 0.109 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.0.attn.proj.weight + | -0.005 | -0.339 | 0.264 | 0.095 | torch.Size([180]) || stage8.2.residual_group.blocks.0.attn.proj.bias + | 0.587 | 0.255 | 0.837 | 0.086 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.weight + | -0.011 | -0.285 | 0.721 | 0.083 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.bias + | -0.006 | -0.586 | 0.534 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc11.weight + | -0.075 | -0.225 | 0.066 | 0.047 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.493 | 0.532 | 0.123 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc12.weight + | 0.003 | -0.189 | 0.178 | 0.047 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.551 | 0.543 | 0.124 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.0.mlp.fc2.weight + | -0.010 | -0.154 | 0.142 | 0.054 | torch.Size([180]) || stage8.2.residual_group.blocks.0.mlp.fc2.bias + | 0.773 | 0.210 | 1.004 | 0.113 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.weight + | 
-0.035 | -0.176 | 0.873 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.bias + | -0.027 | -2.407 | 1.736 | 0.214 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.817 | 0.977 | 0.123 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.1.attn.qkv_self.weight + | 0.001 | -0.659 | 0.461 | 0.115 | torch.Size([540]) || stage8.2.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.484 | 0.453 | 0.109 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.1.attn.proj.weight + | -0.014 | -0.315 | 0.252 | 0.091 | torch.Size([180]) || stage8.2.residual_group.blocks.1.attn.proj.bias + | 0.641 | 0.337 | 0.810 | 0.081 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.weight + | -0.011 | -0.177 | 0.806 | 0.083 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.bias + | -0.006 | -0.569 | 0.598 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc11.weight + | -0.079 | -0.323 | 0.071 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.512 | 0.577 | 0.126 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc12.weight + | -0.003 | -0.142 | 0.161 | 0.050 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.529 | 0.572 | 0.125 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.1.mlp.fc2.weight + | -0.010 | -0.178 | 0.159 | 0.066 | torch.Size([180]) || stage8.2.residual_group.blocks.1.mlp.fc2.bias + | 0.857 | 0.199 | 1.153 | 0.112 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.weight + | -0.039 | -0.189 | 0.943 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.bias + | -0.042 | -1.962 | 2.773 | 0.246 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.783 | 0.655 | 0.123 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.2.attn.qkv_self.weight + | 0.004 | -0.338 | 0.533 | 0.099 | torch.Size([540]) || stage8.2.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.497 | 0.461 | 0.107 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.2.attn.proj.weight + | -0.008 | -0.288 | 0.183 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.2.attn.proj.bias + | 0.681 | 0.327 | 0.878 | 0.085 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.weight + | -0.012 | -0.178 | 0.773 | 0.084 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.bias + | -0.006 | -0.789 | 0.546 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc11.weight + | -0.081 | -0.249 | 0.036 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.526 | 0.555 | 0.128 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc12.weight + | 0.000 | -0.133 | 0.191 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.572 | 0.529 | 0.126 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.164 | 0.147 | 0.065 | torch.Size([180]) || stage8.2.residual_group.blocks.2.mlp.fc2.bias + | 0.877 | 0.198 | 1.043 | 0.094 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.weight + | 
-0.038 | -0.210 | 0.916 | 0.091 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.bias + | -0.094 | -2.974 | 4.987 | 0.299 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.964 | 1.011 | 0.126 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.3.attn.qkv_self.weight + | -0.002 | -0.404 | 0.429 | 0.101 | torch.Size([540]) || stage8.2.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.501 | 0.489 | 0.110 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.3.attn.proj.weight + | -0.021 | -0.305 | 0.208 | 0.097 | torch.Size([180]) || stage8.2.residual_group.blocks.3.attn.proj.bias + | 0.697 | 0.295 | 0.894 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.weight + | -0.015 | -0.241 | 0.712 | 0.086 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.bias + | -0.005 | -0.562 | 0.573 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc11.weight + | -0.085 | -0.302 | 0.080 | 0.060 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.734 | 0.573 | 0.130 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc12.weight + | 0.001 | -0.150 | 0.161 | 0.054 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.671 | 0.623 | 0.127 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.3.mlp.fc2.weight + | -0.023 | -0.252 | 0.317 | 0.081 | torch.Size([180]) || stage8.2.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.278 | 0.345 | 0.064 | torch.Size([180, 180]) || stage8.2.linear.weight + | 0.004 | -0.315 | 0.148 | 0.064 | torch.Size([180]) || stage8.2.linear.bias + | 0.850 | 0.326 | 1.087 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.weight + | -0.031 | -0.334 | 0.779 | 0.106 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.bias + | -0.012 | -2.917 | 1.476 | 0.175 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.603 | 0.666 | 0.124 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.374 | 0.381 | 0.086 | torch.Size([540]) || stage8.3.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.577 | 0.605 | 0.119 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.0.attn.proj.weight + | -0.008 | -0.394 | 0.499 | 0.134 | torch.Size([180]) || stage8.3.residual_group.blocks.0.attn.proj.bias + | 0.636 | 0.321 | 0.790 | 0.073 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.weight + | -0.013 | -0.294 | 0.774 | 0.090 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.bias + | -0.004 | -0.540 | 0.539 | 0.123 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc11.weight + | -0.065 | -0.212 | 0.047 | 0.051 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.608 | 0.603 | 0.130 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc12.weight + | -0.002 | -0.177 | 0.155 | 0.051 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.573 | 0.630 | 0.129 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.0.mlp.fc2.weight + | -0.005 | -0.189 | 0.178 | 0.071 | 
torch.Size([180]) || stage8.3.residual_group.blocks.0.mlp.fc2.bias + | 0.899 | 0.275 | 1.048 | 0.099 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.weight + | -0.031 | -0.223 | 0.771 | 0.088 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.bias + | -0.003 | -3.151 | 1.718 | 0.202 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.1.attn.relative_position_index + | -0.000 | -0.732 | 0.868 | 0.127 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.1.attn.qkv_self.weight + | 0.002 | -0.412 | 0.350 | 0.093 | torch.Size([540]) || stage8.3.residual_group.blocks.1.attn.qkv_self.bias + | 0.001 | -0.466 | 0.487 | 0.114 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.1.attn.proj.weight + | -0.006 | -0.388 | 0.400 | 0.129 | torch.Size([180]) || stage8.3.residual_group.blocks.1.attn.proj.bias + | 0.711 | 0.381 | 0.864 | 0.082 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.weight + | -0.009 | -0.240 | 0.692 | 0.090 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.bias + | -0.005 | -0.657 | 0.639 | 0.126 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc11.weight + | -0.077 | -0.263 | 0.047 | 0.057 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.673 | 0.605 | 0.134 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc12.weight + | 0.002 | -0.158 | 0.155 | 0.046 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc12.bias + | -0.000 | -0.582 | 0.585 | 0.131 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.1.mlp.fc2.weight + | -0.009 | -0.253 | 0.178 | 0.070 | torch.Size([180]) || stage8.3.residual_group.blocks.1.mlp.fc2.bias + | 0.941 | 0.262 | 1.154 | 0.094 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.weight + | -0.032 | -0.162 | 0.906 | 0.084 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.bias + | -0.005 | -3.421 | 1.350 | 0.205 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.777 | 0.735 | 0.130 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.2.attn.qkv_self.weight + | 0.000 | -0.355 | 0.421 | 0.092 | torch.Size([540]) || stage8.3.residual_group.blocks.2.attn.qkv_self.bias + | 0.000 | -0.479 | 0.475 | 0.115 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.2.attn.proj.weight + | -0.013 | -0.292 | 0.345 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.2.attn.proj.bias + | 0.743 | 0.242 | 0.919 | 0.093 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.weight + | -0.011 | -0.214 | 0.691 | 0.094 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.bias + | -0.005 | -0.633 | 0.498 | 0.127 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc11.weight + | -0.082 | -0.346 | 0.087 | 0.062 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.591 | 0.670 | 0.134 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc12.weight + | 0.001 | -0.190 | 0.151 | 0.056 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.560 | 0.637 | 0.132 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.2.mlp.fc2.weight + | -0.009 | -0.226 | 0.250 | 0.085 | 
torch.Size([180]) || stage8.3.residual_group.blocks.2.mlp.fc2.bias + | 0.950 | 0.250 | 1.103 | 0.086 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.weight + | -0.035 | -0.196 | 0.925 | 0.088 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.bias + | -0.026 | -3.591 | 5.653 | 0.236 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.753 | 0.637 | 0.128 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.3.attn.qkv_self.weight + | 0.000 | -0.333 | 0.432 | 0.081 | torch.Size([540]) || stage8.3.residual_group.blocks.3.attn.qkv_self.bias + | 0.001 | -0.591 | 0.591 | 0.118 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.3.attn.proj.weight + | -0.014 | -0.348 | 0.267 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.3.attn.proj.bias + | 0.735 | 0.254 | 0.893 | 0.082 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.weight + | -0.011 | -0.241 | 0.659 | 0.093 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.bias + | -0.005 | -0.628 | 0.667 | 0.125 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc11.weight + | -0.076 | -0.411 | 0.113 | 0.072 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.662 | 0.578 | 0.135 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc12.weight + | -0.004 | -0.208 | 0.169 | 0.054 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.602 | 0.588 | 0.131 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.3.mlp.fc2.weight + | -0.011 | -0.218 | 0.232 | 0.096 | torch.Size([180]) || stage8.3.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.343 | 0.316 | 0.065 | torch.Size([180, 180]) || stage8.3.linear.weight + | 0.010 | -0.297 | 0.187 | 0.061 | torch.Size([180]) || stage8.3.linear.bias + | 1.012 | 0.330 | 1.282 | 0.149 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.weight + | -0.030 | -0.347 | 0.800 | 0.134 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.bias + | -0.013 | -2.816 | 3.792 | 0.236 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.807 | 0.825 | 0.131 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.0.attn.qkv_self.weight + | -0.003 | -0.429 | 0.319 | 0.083 | torch.Size([540]) || stage8.4.residual_group.blocks.0.attn.qkv_self.bias + | 0.001 | -0.553 | 0.569 | 0.136 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.0.attn.proj.weight + | -0.019 | -0.443 | 0.441 | 0.139 | torch.Size([180]) || stage8.4.residual_group.blocks.0.attn.proj.bias + | 0.638 | 0.420 | 0.797 | 0.063 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.weight + | -0.018 | -0.222 | 0.886 | 0.107 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.bias + | -0.002 | -0.576 | 0.510 | 0.117 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc11.weight + | -0.018 | -0.277 | 0.123 | 0.068 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.687 | 0.625 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc12.weight + | -0.007 | -0.264 | 0.267 | 0.076 | torch.Size([360]) || 
stage8.4.residual_group.blocks.0.mlp.fc12.bias + | 0.001 | -0.639 | 0.705 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.0.mlp.fc2.weight + | -0.012 | -0.255 | 0.274 | 0.095 | torch.Size([180]) || stage8.4.residual_group.blocks.0.mlp.fc2.bias + | 1.092 | 0.475 | 1.341 | 0.115 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.weight + | -0.030 | -0.294 | 0.686 | 0.113 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.bias + | 0.018 | -3.165 | 0.990 | 0.213 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.695 | 0.699 | 0.133 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.1.attn.qkv_self.weight + | 0.002 | -0.319 | 0.286 | 0.075 | torch.Size([540]) || stage8.4.residual_group.blocks.1.attn.qkv_self.bias + | -0.001 | -0.542 | 0.519 | 0.133 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.1.attn.proj.weight + | -0.017 | -0.439 | 0.451 | 0.152 | torch.Size([180]) || stage8.4.residual_group.blocks.1.attn.proj.bias + | 0.664 | 0.366 | 0.835 | 0.074 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.weight + | -0.015 | -0.217 | 0.985 | 0.103 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.bias + | -0.002 | -0.641 | 0.563 | 0.117 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc11.weight + | -0.022 | -0.381 | 0.161 | 0.078 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.571 | 0.642 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc12.weight + | 0.003 | -0.279 | 0.311 | 0.087 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.738 | 0.633 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.1.mlp.fc2.weight + | -0.007 | -0.254 | 0.261 | 0.084 | torch.Size([180]) || stage8.4.residual_group.blocks.1.mlp.fc2.bias + | 1.125 | 0.525 | 1.405 | 0.117 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.weight + | -0.033 | -0.186 | 0.627 | 0.082 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.bias + | 0.028 | -3.477 | 0.957 | 0.217 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.663 | 0.658 | 0.130 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.2.attn.qkv_self.weight + | -0.007 | -0.357 | 0.255 | 0.064 | torch.Size([540]) || stage8.4.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.596 | 0.578 | 0.137 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.2.attn.proj.weight + | -0.018 | -0.506 | 0.389 | 0.159 | torch.Size([180]) || stage8.4.residual_group.blocks.2.attn.proj.bias + | 0.694 | 0.319 | 0.865 | 0.084 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.weight + | -0.018 | -0.150 | 0.975 | 0.087 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.bias + | -0.002 | -0.619 | 0.565 | 0.116 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc11.weight + | -0.025 | -0.345 | 0.208 | 0.086 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.624 | 0.607 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc12.weight + | -0.003 | -0.388 | 0.290 | 0.075 | torch.Size([360]) || 
stage8.4.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.927 | 0.675 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.325 | 0.240 | 0.096 | torch.Size([180]) || stage8.4.residual_group.blocks.2.mlp.fc2.bias + | 1.108 | 0.535 | 1.297 | 0.094 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.weight + | -0.035 | -0.213 | 0.546 | 0.064 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.bias + | 0.020 | -3.042 | 1.420 | 0.192 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.697 | 0.700 | 0.128 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.3.attn.qkv_self.weight + | -0.000 | -0.220 | 0.311 | 0.065 | torch.Size([540]) || stage8.4.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.652 | 0.592 | 0.138 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.3.attn.proj.weight + | -0.019 | -0.535 | 0.426 | 0.154 | torch.Size([180]) || stage8.4.residual_group.blocks.3.attn.proj.bias + | 0.685 | 0.225 | 0.893 | 0.082 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.weight + | -0.023 | -0.211 | 0.938 | 0.093 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.bias + | -0.001 | -0.501 | 0.564 | 0.113 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc11.weight + | -0.014 | -0.339 | 0.237 | 0.092 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.560 | 0.626 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc12.weight + | 0.000 | -0.231 | 0.239 | 0.075 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.544 | 0.657 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.3.mlp.fc2.weight + | -0.007 | -0.271 | 0.274 | 0.093 | torch.Size([180]) || stage8.4.residual_group.blocks.3.mlp.fc2.bias + | -0.001 | -0.473 | 0.481 | 0.069 | torch.Size([180, 180]) || stage8.4.linear.weight + | 0.029 | -0.333 | 0.194 | 0.076 | torch.Size([180]) || stage8.4.linear.bias + | 1.025 | 0.297 | 1.336 | 0.162 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.weight + | -0.034 | -0.429 | 0.872 | 0.141 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.bias + | -0.574 | -4.515 | 3.381 | 0.800 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.771 | 0.886 | 0.125 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.0.attn.qkv_self.weight + | 0.000 | -0.356 | 0.521 | 0.085 | torch.Size([540]) || stage8.5.residual_group.blocks.0.attn.qkv_self.bias + | -0.001 | -0.632 | 0.656 | 0.147 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.0.attn.proj.weight + | -0.029 | -0.329 | 0.697 | 0.127 | torch.Size([180]) || stage8.5.residual_group.blocks.0.attn.proj.bias + | 0.777 | 0.446 | 0.952 | 0.069 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.weight + | -0.022 | -0.335 | 0.920 | 0.121 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.bias + | -0.002 | -0.520 | 0.598 | 0.117 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc11.weight + | -0.013 | -0.456 | 0.200 | 0.075 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | 
-0.677 | 0.642 | 0.137 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc12.weight + | 0.005 | -0.272 | 0.233 | 0.083 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.762 | 0.598 | 0.136 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.0.mlp.fc2.weight + | -0.025 | -0.244 | 0.583 | 0.111 | torch.Size([180]) || stage8.5.residual_group.blocks.0.mlp.fc2.bias + | 1.021 | 0.261 | 1.261 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.weight + | -0.033 | -0.358 | 0.867 | 0.120 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.bias + | -0.550 | -3.274 | 4.406 | 0.670 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.819 | 0.986 | 0.122 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.1.attn.qkv_self.weight + | 0.005 | -0.510 | 0.446 | 0.084 | torch.Size([540]) || stage8.5.residual_group.blocks.1.attn.qkv_self.bias + | -0.003 | -0.739 | 0.682 | 0.151 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.1.attn.proj.weight + | -0.032 | -0.318 | 0.607 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.attn.proj.bias + | 0.823 | 0.420 | 0.950 | 0.070 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.weight + | -0.021 | -0.274 | 0.882 | 0.111 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.bias + | -0.002 | -0.496 | 0.532 | 0.117 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc11.weight + | -0.028 | -0.260 | 0.194 | 0.080 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.620 | 0.586 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc12.weight + | 0.004 | -0.284 | 0.423 | 0.083 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.774 | 0.614 | 0.137 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.1.mlp.fc2.weight + | -0.028 | -0.371 | 0.561 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.mlp.fc2.bias + | 1.096 | 0.377 | 1.321 | 0.110 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.weight + | -0.033 | -0.244 | 0.755 | 0.100 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.bias + | -0.441 | -3.439 | 5.870 | 0.668 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.710 | 0.679 | 0.123 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.2.attn.qkv_self.weight + | 0.003 | -0.277 | 0.283 | 0.068 | torch.Size([540]) || stage8.5.residual_group.blocks.2.attn.qkv_self.bias + | 0.001 | -0.824 | 0.684 | 0.150 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.2.attn.proj.weight + | -0.033 | -0.390 | 0.545 | 0.155 | torch.Size([180]) || stage8.5.residual_group.blocks.2.attn.proj.bias + | 0.843 | 0.390 | 0.984 | 0.076 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.weight + | -0.022 | -0.211 | 0.854 | 0.090 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.bias + | -0.002 | -0.522 | 0.503 | 0.116 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc11.weight + | -0.024 | -0.243 | 0.219 | 0.091 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc11.bias + | -0.001 | -0.638 | 0.617 | 
0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc12.weight + | -0.004 | -0.268 | 0.380 | 0.078 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.713 | 0.769 | 0.138 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.2.mlp.fc2.weight + | -0.034 | -0.372 | 0.592 | 0.151 | torch.Size([180]) || stage8.5.residual_group.blocks.2.mlp.fc2.bias + | 1.027 | 0.318 | 1.206 | 0.094 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.weight + | -0.033 | -0.187 | 0.768 | 0.088 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.bias + | -0.347 | -2.664 | 2.684 | 0.528 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.677 | 0.676 | 0.127 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.3.attn.qkv_self.weight + | 0.002 | -0.410 | 0.354 | 0.080 | torch.Size([540]) || stage8.5.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.630 | 0.725 | 0.145 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.3.attn.proj.weight + | -0.041 | -0.385 | 0.660 | 0.163 | torch.Size([180]) || stage8.5.residual_group.blocks.3.attn.proj.bias + | 0.849 | 0.390 | 0.985 | 0.070 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.weight + | -0.023 | -0.163 | 0.810 | 0.084 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.bias + | -0.002 | -0.547 | 0.536 | 0.115 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc11.weight + | -0.012 | -0.366 | 0.252 | 0.106 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.669 | 0.597 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc12.weight + | -0.002 | -0.216 | 0.202 | 0.074 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.700 | 0.674 | 0.139 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.3.mlp.fc2.weight + | -0.032 | -0.376 | 0.666 | 0.134 | torch.Size([180]) || stage8.5.residual_group.blocks.3.mlp.fc2.bias + | -0.001 | -0.299 | 0.469 | 0.069 | torch.Size([180, 180]) || stage8.5.linear.weight + | 0.081 | -0.562 | 0.263 | 0.109 | torch.Size([180]) || stage8.5.linear.bias + | 1.111 | 0.208 | 1.434 | 0.192 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.weight + | -0.048 | -0.547 | 0.851 | 0.175 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.bias + | -0.252 | -2.157 | 6.293 | 0.490 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.664 | 0.631 | 0.123 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.0.attn.qkv_self.weight + | 0.007 | -0.293 | 0.366 | 0.078 | torch.Size([540]) || stage8.6.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.701 | 0.726 | 0.154 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.0.attn.proj.weight + | 0.030 | -0.318 | 0.331 | 0.109 | torch.Size([180]) || stage8.6.residual_group.blocks.0.attn.proj.bias + | 0.959 | 0.475 | 1.322 | 0.088 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.weight + | -0.039 | -0.421 | 0.873 | 0.151 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.bias + | -0.002 | -0.550 | 0.783 | 0.116 | torch.Size([360, 180]) || 
stage8.6.residual_group.blocks.0.mlp.fc11.weight + | 0.002 | -0.269 | 0.152 | 0.069 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.914 | 0.839 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc12.weight + | 0.001 | -0.340 | 0.304 | 0.075 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.592 | 0.713 | 0.140 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.0.mlp.fc2.weight + | 0.002 | -0.535 | 0.384 | 0.177 | torch.Size([180]) || stage8.6.residual_group.blocks.0.mlp.fc2.bias + | 1.123 | 0.183 | 1.352 | 0.165 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.weight + | -0.047 | -0.513 | 0.903 | 0.168 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.bias + | -0.234 | -1.968 | 6.366 | 0.448 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.751 | 0.759 | 0.121 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.1.attn.qkv_self.weight + | -0.001 | -0.300 | 0.214 | 0.061 | torch.Size([540]) || stage8.6.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.657 | 0.699 | 0.148 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.1.attn.proj.weight + | 0.031 | -0.321 | 0.293 | 0.115 | torch.Size([180]) || stage8.6.residual_group.blocks.1.attn.proj.bias + | 0.986 | 0.416 | 1.360 | 0.096 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.weight + | -0.038 | -0.393 | 0.807 | 0.146 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.bias + | -0.001 | -0.589 | 0.620 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc11.weight + | 0.005 | -0.316 | 0.229 | 0.071 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.738 | 0.766 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc12.weight + | 0.001 | -0.252 | 0.302 | 0.072 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.674 | 0.629 | 0.140 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.1.mlp.fc2.weight + | -0.001 | -0.475 | 0.441 | 0.175 | torch.Size([180]) || stage8.6.residual_group.blocks.1.mlp.fc2.bias + | 1.097 | 0.342 | 1.294 | 0.134 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.weight + | -0.054 | -0.639 | 0.904 | 0.186 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.bias + | -0.135 | -3.252 | 1.238 | 0.360 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.672 | 0.663 | 0.128 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.2.attn.qkv_self.weight + | 0.007 | -0.170 | 0.228 | 0.046 | torch.Size([540]) || stage8.6.residual_group.blocks.2.attn.qkv_self.bias + | -0.001 | -0.660 | 0.651 | 0.147 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.2.attn.proj.weight + | 0.031 | -0.360 | 0.322 | 0.126 | torch.Size([180]) || stage8.6.residual_group.blocks.2.attn.proj.bias + | 1.004 | 0.360 | 1.381 | 0.099 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.weight + | -0.042 | -0.447 | 0.808 | 0.157 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.bias + | -0.000 | -0.600 | 0.603 | 0.116 | torch.Size([360, 180]) || 
stage8.6.residual_group.blocks.2.mlp.fc11.weight + | 0.022 | -0.447 | 0.249 | 0.086 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.666 | 0.708 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc12.weight + | -0.002 | -0.326 | 0.272 | 0.075 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc12.bias + | -0.001 | -0.653 | 0.719 | 0.142 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.488 | 0.321 | 0.153 | torch.Size([180]) || stage8.6.residual_group.blocks.2.mlp.fc2.bias + | 1.095 | 0.272 | 1.302 | 0.123 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.weight + | -0.052 | -0.557 | 1.069 | 0.192 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.bias + | -0.196 | -2.349 | 1.401 | 0.360 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.741 | 0.657 | 0.124 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.3.attn.qkv_self.weight + | 0.001 | -0.186 | 0.141 | 0.040 | torch.Size([540]) || stage8.6.residual_group.blocks.3.attn.qkv_self.bias + | -0.001 | -0.669 | 0.671 | 0.139 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.3.attn.proj.weight + | -0.004 | -0.323 | 0.300 | 0.124 | torch.Size([180]) || stage8.6.residual_group.blocks.3.attn.proj.bias + | 0.999 | 0.383 | 1.380 | 0.103 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.weight + | -0.044 | -0.392 | 0.694 | 0.163 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.bias + | 0.000 | -0.577 | 0.857 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc11.weight + | 0.041 | -0.394 | 0.238 | 0.087 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.924 | 0.828 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc12.weight + | -0.003 | -0.214 | 0.407 | 0.071 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.827 | 0.755 | 0.141 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.3.mlp.fc2.weight + | 0.022 | -0.296 | 0.262 | 0.107 | torch.Size([180]) || stage8.6.residual_group.blocks.3.mlp.fc2.bias + | 0.002 | -1.059 | 1.262 | 0.089 | torch.Size([180, 180]) || stage8.6.linear.weight + | 0.031 | -0.789 | 0.427 | 0.120 | torch.Size([180]) || stage8.6.linear.bias + | 0.389 | 0.079 | 1.137 | 0.176 | torch.Size([180]) || norm.weight + | -0.021 | -0.669 | 0.888 | 0.127 | torch.Size([180]) || norm.bias + | 0.000 | -0.486 | 0.568 | 0.103 | torch.Size([120, 180]) || conv_after_body.weight + | -0.000 | -0.167 | 0.168 | 0.055 | torch.Size([120]) || conv_after_body.bias + | -0.000 | -1.782 | 1.300 | 0.109 | torch.Size([64, 120, 1, 3, 3]) || conv_before_upsample.0.weight + | -0.019 | -0.542 | 0.437 | 0.162 | torch.Size([64]) || conv_before_upsample.0.bias + | 0.001 | -1.915 | 1.372 | 0.090 | torch.Size([256, 64, 1, 3, 3]) || upsample.0.weight + | -0.045 | -0.281 | 0.215 | 0.097 | torch.Size([256]) || upsample.0.bias + | -0.006 | -4.826 | 0.582 | 0.075 | torch.Size([256, 64, 1, 3, 3]) || upsample.5.weight + | -0.154 | -0.441 | 0.187 | 0.100 | torch.Size([256]) || upsample.5.bias + | 0.000 | -0.210 | 0.246 | 0.012 | torch.Size([64, 64, 1, 3, 3]) || upsample.10.weight + | 0.000 | -0.013 | 0.007 | 0.003 | torch.Size([64]) || upsample.10.bias + | 0.000 | -0.044 | 0.042 | 0.004 | torch.Size([3, 64, 1, 3, 3]) || conv_last.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([3]) || conv_last.bias +
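The table above lists, for every entry in the generator's state_dict, the values mean | min | max | std | shape followed by the parameter name. A minimal sketch of how such a table can be produced from any torch.nn.Module; the helper name describe_params and the exact column formatting here are illustrative, not KAIR's verbatim code:

```python
import torch

def describe_params(model: torch.nn.Module) -> str:
    """Emit one row per state_dict entry: mean | min | max | std | shape || name."""
    rows = []
    for name, tensor in model.state_dict().items():
        # Cast to float so integer buffers (e.g. relative_position_index)
        # also yield mean/std, as they do in the log above.
        v = tensor.data.clone().float()
        std = v.std().item() if v.numel() > 1 else 0.0  # std of a 1-element tensor is nan
        rows.append(' | {:>7.3f} | {:>7.3f} | {:>7.3f} | {:>7.3f} | {} || {}'.format(
            v.mean().item(), v.min().item(), v.max().item(), std, v.shape, name))
    return '\n'.join(rows)
```

The float cast explains the fractional statistics reported for index buffers, e.g. relative_position_index over values 0..674 showing mean 337.000 and std 166.395.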
+22-03-11 10:52:19.525 : task: 001_train_vrt_videosr_bi_reds_6frames + model: vrt + gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7] + dist: False + find_unused_parameters: False + use_static_graph: True + scale: 4 + n_channels: 3 + path:[ + root: experiments + pretrained_netG: /home/cll/dev/KAIR/model_zoo/vrt/001_VRT_videosr_bi_REDS_6frames.pth + pretrained_netE: None + task: experiments/001_train_vrt_videosr_bi_reds_6frames + log: experiments/001_train_vrt_videosr_bi_reds_6frames + options: experiments/001_train_vrt_videosr_bi_reds_6frames/options + models: experiments/001_train_vrt_videosr_bi_reds_6frames/models + images: experiments/001_train_vrt_videosr_bi_reds_6frames/images + pretrained_optimizerG: None + ] + datasets:[ + train:[ + name: train_dataset + dataset_type: VideoRecurrentTrainDataset + dataroot_gt: /home/cll/datasets/REDS/train/train_sharp + dataroot_lq: /home/cll/datasets/REDS/train/train_sharp_bicubic/X4 + meta_info_file: data/meta_info/meta_info_REDS_GT.txt + filename_tmpl: 08d + filename_ext: png + val_partition: REDS4 + test_mode: False + io_backend:[ + type: disk + ] + num_frame: 6 + gt_size: 256 + interval_list: [1] + random_reverse: False + use_hflip: True + use_rot: True + dataloader_shuffle: True + dataloader_num_workers: 32 + dataloader_batch_size: 8 + phase: train + scale: 4 + n_channels: 3 + ] + test:[ + name: test_dataset + dataset_type: VideoRecurrentTestDataset + dataroot_gt: /home/cll/Desktop/REDS4/GT + dataroot_lq: /home/cll/Desktop/REDS4/sharp_bicubic + cache_data: True + io_backend:[ + type: disk + ] + num_frame: -1 + phase: test + scale: 4 + n_channels: 3 + ] + ] + netG:[ + net_type: vrt + upscale: 4 + img_size: [6, 64, 64] + window_size: [6, 8, 8] + depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4] + indep_reconsts: [11, 12] + embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180] + num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6] + spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth + pa_frames: 2 + deformable_groups: 12 + nonblind_denoising: False + use_checkpoint_attn: False + use_checkpoint_ffn: False + no_checkpoint_attn_blocks: [] + no_checkpoint_ffn_blocks: [] + init_type: default + scale: 4 + ] + train:[ + G_lossfn_type: charbonnier + G_lossfn_weight: 1.0 + G_charbonnier_eps: 1e-09 + E_decay: 0 + G_optimizer_type: adam + G_optimizer_lr: 0.0004 + G_optimizer_betas: [0.9, 0.99] + G_optimizer_wd: 0 + G_optimizer_clipgrad: None + G_optimizer_reuse: True + fix_iter: 20000 + fix_lr_mul: 0.125 + fix_keys: ['spynet', 'deform'] + total_iter: 300000 + G_scheduler_type: CosineAnnealingWarmRestarts + G_scheduler_periods: 300000 + G_scheduler_eta_min: 1e-07 + G_regularizer_orthstep: None + G_regularizer_clipstep: None + G_param_strict: True + E_param_strict: True + checkpoint_test: 5000 + checkpoint_save: 5000 + checkpoint_print: 200 + F_feature_layer: 34 + F_weights: 1.0 + F_lossfn_type: l1 + F_use_input_norm: True + F_use_range_norm: False + G_scheduler_restart_weights: 1 + ] + val:[ + save_img: False + pad_seq: False + flip_seq: False + center_frame_only: False + num_frame_testing: 40 + num_frame_overlapping: 2 + size_patch_testing: 128 + ] + opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json + is_train: True + merge_bn: False + merge_bn_startpoint: -1 + num_gpu: 8 + rank: 0 + world_size: 1 + +22-03-11 10:52:19.571 : Number of train images: 24,000, iters: 3,000
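Compared with the first run, this configuration resumes from the released 6-frame VRT checkpoint (pretrained_netG) and reads REDS from plain disk folders instead of LMDB. With G_param_strict: True, the checkpoint keys and shapes must match the generator exactly. A hypothetical sketch of that loading step; net_g stands for a VRT instance built from the netG options above, and the 'params' sub-key is an assumption about the checkpoint layout, not something this log confirms:

```python
import torch

def load_pretrained_g(net_g: torch.nn.Module,
                      ckpt_path: str = 'model_zoo/vrt/001_VRT_videosr_bi_REDS_6frames.pth'):
    """Load pretrained_netG into the generator with strict key/shape matching,
    mirroring G_param_strict: True in the options above."""
    state = torch.load(ckpt_path, map_location='cpu')
    if isinstance(state, dict) and 'params' in state:
        # Some released checkpoints nest the weights under a 'params' key
        # (assumption for illustration).
        state = state['params']
    net_g.load_state_dict(state, strict=True)
    return net_g
```

The network description that follows ends each TMSA block with an Mlp_GEGLU whose paired fc11/fc12 projections feed a single fc2. A self-contained sketch of a gated feed-forward consistent with that layout, assuming fc11 carries the GELU gate and that dropout sits before fc2 (placement not confirmed by the log):

```python
import torch
import torch.nn as nn

class MlpGEGLU(nn.Module):
    """Gated-GELU MLP matching the fc11/fc12/fc2 layout in the dump below:
    out = fc2(GELU(fc11(x)) * fc12(x))."""
    def __init__(self, dim: int = 120, hidden: int = 240, drop: float = 0.0):
        super().__init__()
        self.fc11 = nn.Linear(dim, hidden)  # gate branch, passed through GELU
        self.fc12 = nn.Linear(dim, hidden)  # value branch
        self.fc2 = nn.Linear(hidden, dim)
        self.act = nn.GELU()
        self.drop = nn.Dropout(drop)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc2(self.drop(self.act(self.fc11(x)) * self.fc12(x)))
```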
+22-03-11 10:52:33.932 : +Networks name: VRT +Params number: 30676435 +Net structure: +VRT( + (conv_first): Conv3d(27, 120, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (spynet): SpyNet( + (basic_module): ModuleList( + (0): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (1): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (2): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (3): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (4): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (5): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + ) + ) + (stage1): Stage( + (reshape): Sequential( + (0): Rearrange('n c d h w -> n d h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): 
Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): 
Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + )
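Stage 1 above shows the pattern that stages 2-7 repeat below: residual_group1 stacks six TMSA blocks whose WindowAttention carries both a qkv_self and a qkv_mut projection (proj maps the 240 concatenated self- and mutual-attention channels back to 120), residual_group2 stacks two self-attention-only blocks (proj is 120 -> 120), and pa_deform/pa_fuse perform flow-guided deformable alignment and feature fusion. The recurring Mlp_GEGLU is a gated MLP; a sketch consistent with the printed fc11/fc12/fc2 shapes (the exact dropout placement is an assumption):

import torch.nn as nn

class Mlp_GEGLU(nn.Module):
    # Gated-GELU MLP: fc2(GELU(fc11(x)) * fc12(x)).
    def __init__(self, in_features, hidden_features, out_features, drop=0.0):
        super().__init__()
        self.fc11 = nn.Linear(in_features, hidden_features)  # gate branch
        self.fc12 = nn.Linear(in_features, hidden_features)  # value branch
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.act(self.fc11(x)) * self.fc12(x)  # element-wise gating
        return self.fc2(self.drop(x))

Inside TMSA the hidden width is twice the embedding dim (120 -> 240 -> 120); pa_fuse instead takes a 360-channel input (three 120-channel feature maps concatenated) and projects back to 120.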
+ (stage2): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): 
Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage3): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, 
out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + 
(attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage4): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + 
(fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): 
Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage5): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, 
elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, 
elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage6): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() 
+ (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): 
Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage7): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, 
inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, 
out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage8): ModuleList( + (0): Sequential( + (0): Rearrange('n c d h w -> n d h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=120, out_features=180, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (1): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + 
(act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (2): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (3): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) 
+ ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (4): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): 
Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (5): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) 
+ (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (6): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + ) + (norm): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (conv_after_body): Linear(in_features=180, out_features=120, bias=True) + (conv_before_upsample): Sequential( + (0): Conv3d(120, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): LeakyReLU(negative_slope=0.01, inplace=True) + ) + (upsample): Upsample( + (0): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): Transpose_Dim12() + (2): PixelShuffle(upscale_factor=2) + (3): Transpose_Dim12() + (4): LeakyReLU(negative_slope=0.1, inplace=True) + (5): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (6): Transpose_Dim12() + (7): 
PixelShuffle(upscale_factor=2) + (8): Transpose_Dim12() + (9): LeakyReLU(negative_slope=0.1, inplace=True) + (10): Conv3d(64, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + ) + (conv_last): Conv3d(64, 3, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) +) + +22-03-11 10:52:34.115 : + | mean | min | max | std || shape + | -0.000 | -1.462 | 1.580 | 0.103 | torch.Size([120, 27, 1, 3, 3]) || conv_first.weight + | 0.005 | -0.950 | 0.885 | 0.268 | torch.Size([120]) || conv_first.bias + | 0.449 | 0.406 | 0.485 | 0.040 | torch.Size([1, 3, 1, 1]) || spynet.mean + | 0.226 | 0.224 | 0.229 | 0.003 | torch.Size([1, 3, 1, 1]) || spynet.std + | -0.000 | -0.679 | 0.720 | 0.066 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.0.basic_module.0.weight + | -0.042 | -0.894 | 0.351 | 0.344 | torch.Size([32]) || spynet.basic_module.0.basic_module.0.bias + | -0.008 | -3.201 | 0.948 | 0.097 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.0.basic_module.2.weight + | 0.059 | -1.268 | 0.732 | 0.320 | torch.Size([64]) || spynet.basic_module.0.basic_module.2.bias + | -0.010 | -4.633 | 0.568 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.0.basic_module.4.weight + | 0.159 | -0.704 | 0.859 | 0.353 | torch.Size([32]) || spynet.basic_module.0.basic_module.4.bias + | -0.024 | -1.714 | 0.414 | 0.091 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.0.basic_module.6.weight + | 0.780 | -1.061 | 1.162 | 0.519 | torch.Size([16]) || spynet.basic_module.0.basic_module.6.bias + | 0.000 | -0.144 | 0.163 | 0.018 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.0.basic_module.8.weight + | 0.001 | -0.003 | 0.005 | 0.006 | torch.Size([2]) || spynet.basic_module.0.basic_module.8.bias + | 0.000 | -0.726 | 0.773 | 0.070 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.1.basic_module.0.weight + | -0.021 | -0.814 | 0.355 | 0.323 | torch.Size([32]) || spynet.basic_module.1.basic_module.0.bias + | -0.010 | -3.380 | 0.916 | 0.099 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.1.basic_module.2.weight + | 0.038 | -1.207 | 0.714 | 0.301 | torch.Size([64]) || spynet.basic_module.1.basic_module.2.bias + | -0.008 | -4.462 | 0.549 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.1.basic_module.4.weight + | 0.157 | -0.742 | 0.980 | 0.384 | torch.Size([32]) || spynet.basic_module.1.basic_module.4.bias + | -0.020 | -1.648 | 0.319 | 0.084 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.1.basic_module.6.weight + | 0.775 | -1.195 | 1.148 | 0.546 | torch.Size([16]) || spynet.basic_module.1.basic_module.6.bias + | -0.000 | -0.122 | 0.152 | 0.016 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.1.basic_module.8.weight + | -0.000 | -0.002 | 0.001 | 0.002 | torch.Size([2]) || spynet.basic_module.1.basic_module.8.bias + | 0.000 | -0.956 | 0.870 | 0.088 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.2.basic_module.0.weight + | -0.025 | -1.040 | 0.512 | 0.411 | torch.Size([32]) || spynet.basic_module.2.basic_module.0.bias + | -0.011 | -4.624 | 1.195 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.2.basic_module.2.weight + | 0.023 | -1.284 | 0.699 | 0.308 | torch.Size([64]) || spynet.basic_module.2.basic_module.2.bias + | -0.009 | -1.831 | 0.616 | 0.092 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.2.basic_module.4.weight + | 0.120 | -0.695 | 0.755 | 0.332 | torch.Size([32]) || spynet.basic_module.2.basic_module.4.bias + | -0.013 | -1.285 | 0.304 | 0.068 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.2.basic_module.6.weight
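Each row of this dump summarizes one tensor of the freshly built network: mean, min, max and standard deviation of its values, followed by its shape and its name in the state dict. Fixed buffers (e.g. spynet.mean and the relative_position_index tables) are listed alongside learned weights, so the dump covers everything that ends up in a checkpoint. A minimal sketch of how such a table can be produced with plain PyTorch follows; print_param_stats is an illustrative name, not KAIR's exact helper:

    import torch

    def print_param_stats(model: torch.nn.Module) -> None:
        # Header matches the log layout: | mean | min | max | std | shape || name.
        print(' | mean | min | max | std || shape')
        # state_dict() covers buffers as well as parameters, which is why
        # constant tensors such as relative_position_index appear in the table.
        for name, t in model.state_dict().items():
            v = t.detach().float()
            print(f' | {v.mean().item():.3f} | {v.min().item():.3f}'
                  f' | {v.max().item():.3f} | {v.std().item():.3f}'
                  f' | {t.shape} || {name}')

Run on the VRT instance printed above, this would yield rows like the conv_first.weight line that opens the table.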
+ | 0.681 | -1.725 | 0.942 | 0.646 | torch.Size([16]) || spynet.basic_module.2.basic_module.6.bias + | 0.000 | -0.045 | 0.071 | 0.009 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.2.basic_module.8.weight + | -0.010 | -0.010 | -0.009 | 0.000 | torch.Size([2]) || spynet.basic_module.2.basic_module.8.bias + | -0.000 | -0.995 | 0.879 | 0.090 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.3.basic_module.0.weight + | -0.040 | -1.137 | 0.617 | 0.461 | torch.Size([32]) || spynet.basic_module.3.basic_module.0.bias + | -0.010 | -4.891 | 1.224 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.3.basic_module.2.weight + | 0.022 | -1.287 | 0.745 | 0.313 | torch.Size([64]) || spynet.basic_module.3.basic_module.2.bias + | -0.010 | -1.802 | 0.561 | 0.090 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.3.basic_module.4.weight + | 0.118 | -0.694 | 0.697 | 0.329 | torch.Size([32]) || spynet.basic_module.3.basic_module.4.bias + | -0.012 | -1.107 | 0.306 | 0.064 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.3.basic_module.6.weight + | 0.658 | -1.792 | 0.905 | 0.659 | torch.Size([16]) || spynet.basic_module.3.basic_module.6.bias + | 0.000 | -0.030 | 0.037 | 0.006 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.3.basic_module.8.weight + | 0.003 | -0.001 | 0.007 | 0.006 | torch.Size([2]) || spynet.basic_module.3.basic_module.8.bias + | -0.000 | -0.990 | 0.880 | 0.090 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.4.basic_module.0.weight + | -0.010 | -1.067 | 0.596 | 0.437 | torch.Size([32]) || spynet.basic_module.4.basic_module.0.bias + | -0.010 | -5.061 | 1.229 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.4.basic_module.2.weight + | 0.024 | -1.274 | 0.830 | 0.318 | torch.Size([64]) || spynet.basic_module.4.basic_module.2.bias + | -0.009 | -1.787 | 0.563 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.4.basic_module.4.weight + | 0.130 | -0.685 | 0.743 | 0.335 | torch.Size([32]) || spynet.basic_module.4.basic_module.4.bias + | -0.011 | -0.973 | 0.292 | 0.061 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.4.basic_module.6.weight + | 0.659 | -1.855 | 0.931 | 0.679 | torch.Size([16]) || spynet.basic_module.4.basic_module.6.bias + | 0.000 | -0.034 | 0.040 | 0.005 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.4.basic_module.8.weight + | -0.001 | -0.009 | 0.007 | 0.012 | torch.Size([2]) || spynet.basic_module.4.basic_module.8.bias + | -0.000 | -0.973 | 0.853 | 0.089 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.5.basic_module.0.weight + | 0.022 | -1.001 | 0.571 | 0.440 | torch.Size([32]) || spynet.basic_module.5.basic_module.0.bias + | -0.009 | -5.095 | 1.251 | 0.119 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.5.basic_module.2.weight + | 0.026 | -1.305 | 0.880 | 0.326 | torch.Size([64]) || spynet.basic_module.5.basic_module.2.bias + | -0.008 | -1.815 | 0.561 | 0.091 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.5.basic_module.4.weight + | 0.137 | -0.711 | 0.771 | 0.342 | torch.Size([32]) || spynet.basic_module.5.basic_module.4.bias + | -0.010 | -0.986 | 0.286 | 0.059 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.5.basic_module.6.weight + | 0.671 | -1.913 | 0.966 | 0.700 | torch.Size([16]) || spynet.basic_module.5.basic_module.6.bias + | 0.000 | -0.034 | 0.028 | 0.002 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.5.basic_module.8.weight + | 0.002 | -0.013 | 0.016 | 0.020 | torch.Size([2]) || spynet.basic_module.5.basic_module.8.bias + | 1.280 | 0.669 | 1.862 | 0.274 | torch.Size([120]) || stage1.reshape.1.weight + | -0.006 | -0.324 | 0.337 | 0.106 | 
torch.Size([120]) || stage1.reshape.1.bias + | 0.579 | 0.129 | 1.064 | 0.236 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.weight + | -0.039 | -1.100 | 0.894 | 0.226 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.bias + | -0.134 | -4.020 | 2.585 | 0.295 | torch.Size([675, 6]) || stage1.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.579 | 0.618 | 0.113 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_self.weight + | 0.000 | -0.319 | 0.279 | 0.074 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_self.bias + | 0.001 | -0.634 | 0.686 | 0.076 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.attn.proj.weight + | -0.014 | -0.222 | 0.642 | 0.088 | torch.Size([120]) || stage1.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -1.066 | 0.928 | 0.097 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.000 | -0.146 | 0.190 | 0.033 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.781 | 0.367 | 1.203 | 0.160 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.weight + | 0.029 | -0.378 | 0.545 | 0.159 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.bias + | 0.001 | -0.687 | 0.753 | 0.108 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc11.weight + | -0.010 | -0.229 | 0.633 | 0.095 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.674 | 0.669 | 0.117 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc12.weight + | 0.011 | -0.448 | 0.368 | 0.116 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc12.bias + | 0.001 | -0.862 | 0.941 | 0.119 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.mlp.fc2.weight + | -0.004 | -0.267 | 0.594 | 0.099 | torch.Size([120]) || stage1.residual_group1.blocks.0.mlp.fc2.bias + | 0.797 | 0.211 | 1.475 | 0.209 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.weight + | -0.161 | -1.941 | 0.746 | 0.237 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.bias + | -0.296 | -3.927 | 2.840 | 0.478 | torch.Size([675, 6]) || stage1.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.1.attn.position_bias + | 0.001 | -1.479 | 1.395 | 0.143 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_self.weight + | -0.003 | -0.381 | 0.258 | 0.063 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.526 | 0.561 | 0.079 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.attn.proj.weight + | -0.003 | -0.178 | 0.478 | 0.078 | torch.Size([120]) || stage1.residual_group1.blocks.1.attn.proj.bias + | 0.001 | -1.242 | 1.138 | 0.105 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.004 | -0.213 | 0.196 | 0.050 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.702 | 0.349 | 0.904 | 0.085 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.weight + | 0.039 | -0.646 | 0.384 | 0.132 | torch.Size([120]) || 
stage1.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.872 | 0.750 | 0.131 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc11.weight + | -0.049 | -0.353 | 0.135 | 0.084 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.562 | 0.580 | 0.117 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc12.weight + | 0.000 | -0.238 | 0.457 | 0.113 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.828 | 0.685 | 0.123 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.mlp.fc2.weight + | 0.031 | -0.297 | 0.419 | 0.094 | torch.Size([120]) || stage1.residual_group1.blocks.1.mlp.fc2.bias + | 0.984 | 0.163 | 1.398 | 0.202 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.weight + | -0.167 | -1.609 | 0.367 | 0.182 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.bias + | -0.343 | -4.484 | 2.362 | 0.486 | torch.Size([675, 6]) || stage1.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.2.attn.position_bias + | 0.000 | -1.586 | 1.649 | 0.151 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_self.weight + | -0.000 | -0.220 | 0.240 | 0.056 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.378 | 0.514 | 0.086 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.attn.proj.weight + | -0.009 | -0.143 | 0.172 | 0.059 | torch.Size([120]) || stage1.residual_group1.blocks.2.attn.proj.bias + | 0.001 | -0.639 | 0.582 | 0.102 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.000 | -0.141 | 0.173 | 0.035 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.733 | 0.277 | 0.903 | 0.081 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.weight + | 0.038 | -0.861 | 0.359 | 0.142 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.787 | 0.679 | 0.131 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc11.weight + | -0.029 | -0.365 | 0.143 | 0.076 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.574 | 0.539 | 0.120 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc12.weight + | -0.007 | -0.283 | 0.254 | 0.097 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc12.bias + | 0.001 | -0.998 | 0.522 | 0.124 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.mlp.fc2.weight + | 0.030 | -0.169 | 0.293 | 0.095 | torch.Size([120]) || stage1.residual_group1.blocks.2.mlp.fc2.bias + | 1.035 | 0.143 | 1.397 | 0.196 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.weight + | -0.161 | -1.413 | 0.084 | 0.154 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.bias + | -0.441 | -4.685 | 3.306 | 0.529 | torch.Size([675, 6]) || stage1.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.3.attn.position_bias + | 0.000 | -1.590 | 1.329 | 0.155 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_self.weight + | -0.002 | -0.266 | 0.232 | 0.049 | torch.Size([360]) || 
stage1.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.366 | 0.372 | 0.084 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.attn.proj.weight + | -0.011 | -0.225 | 0.171 | 0.071 | torch.Size([120]) || stage1.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.660 | 0.801 | 0.100 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.001 | -0.139 | 0.200 | 0.031 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.724 | 0.190 | 0.911 | 0.091 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.weight + | 0.038 | -0.981 | 0.285 | 0.137 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.bias + | 0.001 | -0.611 | 0.598 | 0.130 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc11.weight + | -0.035 | -0.299 | 0.221 | 0.081 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.502 | 0.520 | 0.124 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc12.weight + | -0.002 | -0.271 | 0.215 | 0.090 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.558 | 0.898 | 0.127 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.mlp.fc2.weight + | 0.010 | -0.424 | 0.190 | 0.082 | torch.Size([120]) || stage1.residual_group1.blocks.3.mlp.fc2.bias + | 1.085 | 0.169 | 1.400 | 0.157 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.weight + | -0.086 | -1.613 | 0.150 | 0.160 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.bias + | -0.541 | -3.902 | 3.728 | 0.633 | torch.Size([675, 6]) || stage1.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.4.attn.position_bias + | 0.001 | -1.879 | 1.832 | 0.150 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_self.weight + | 0.001 | -0.391 | 0.444 | 0.079 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_self.bias + | -0.000 | -0.407 | 0.448 | 0.087 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.attn.proj.weight + | -0.013 | -0.302 | 0.342 | 0.104 | torch.Size([120]) || stage1.residual_group1.blocks.4.attn.proj.bias + | -0.001 | -0.830 | 0.863 | 0.102 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.001 | -0.117 | 0.094 | 0.024 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.704 | 0.195 | 0.870 | 0.079 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.weight + | 0.031 | -1.069 | 0.276 | 0.140 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.656 | 0.555 | 0.130 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc11.weight + | -0.029 | -0.387 | 0.256 | 0.102 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc11.bias + | 0.001 | -0.590 | 0.624 | 0.127 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc12.weight + | -0.011 | -0.277 | 0.303 | 0.087 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -1.124 | 0.539 | 0.130 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.mlp.fc2.weight + | -0.006 | -0.718 | 0.133 | 0.094 | torch.Size([120]) || stage1.residual_group1.blocks.4.mlp.fc2.bias + | 1.037 | 0.176 | 1.327 | 0.158 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.weight + | -0.112 
| -1.591 | 0.177 | 0.169 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.bias + | -0.438 | -2.229 | 2.797 | 0.523 | torch.Size([675, 6]) || stage1.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.5.attn.position_bias + | -0.000 | -2.212 | 1.826 | 0.153 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_self.weight + | 0.001 | -0.343 | 0.338 | 0.068 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_self.bias + | 0.000 | -0.367 | 0.451 | 0.087 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.attn.proj.weight + | -0.022 | -0.358 | 0.242 | 0.128 | torch.Size([120]) || stage1.residual_group1.blocks.5.attn.proj.bias + | 0.001 | -0.922 | 0.886 | 0.104 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.002 | -0.083 | 0.089 | 0.022 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.662 | 0.277 | 0.831 | 0.066 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.weight + | 0.025 | -0.959 | 0.261 | 0.132 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.bias + | -0.001 | -0.636 | 0.739 | 0.129 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc11.weight + | -0.030 | -0.419 | 0.517 | 0.115 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.615 | 0.709 | 0.126 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc12.weight + | 0.002 | -0.230 | 0.457 | 0.087 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc12.bias + | 0.001 | -1.724 | 1.186 | 0.132 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.mlp.fc2.weight + | -0.019 | -1.909 | 0.255 | 0.190 | torch.Size([120]) || stage1.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.242 | 0.244 | 0.057 | torch.Size([120, 120]) || stage1.linear1.weight + | 0.004 | -0.221 | 0.224 | 0.083 | torch.Size([120]) || stage1.linear1.bias + | 0.737 | 0.334 | 1.046 | 0.119 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.weight + | 0.013 | -0.911 | 0.763 | 0.193 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.bias + | -0.052 | -2.462 | 2.040 | 0.273 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.785 | 0.767 | 0.123 | torch.Size([360, 120]) || stage1.residual_group2.blocks.0.attn.qkv_self.weight + | 0.009 | -0.466 | 0.552 | 0.122 | torch.Size([360]) || stage1.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.431 | 0.475 | 0.091 | torch.Size([120, 120]) || stage1.residual_group2.blocks.0.attn.proj.weight + | -0.009 | -0.796 | 0.497 | 0.109 | torch.Size([120]) || stage1.residual_group2.blocks.0.attn.proj.bias + | 0.573 | 0.409 | 0.935 | 0.096 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.weight + | 0.015 | -0.828 | 0.839 | 0.175 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.bias + | 0.001 | -0.604 | 0.542 | 0.109 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc11.weight + | 0.037 | -0.179 | 0.273 | 0.076 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc11.bias
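The relative_position_index entries in these rows are precomputed integer buffers, not learned weights, which is why their statistics never change: for a (2, 8, 8) attention window the index runs over 0..674 on 128x128 token pairs (mean exactly 337.000), and for a (6, 8, 8) window over 0..2474 on 384x384 (mean 1237.000), matching the table sizes logged above. A short sketch of the Swin-style 3D construction, under the assumption that VRT builds its index the same way (the function name is illustrative):

    import torch

    def relative_position_index_3d(d: int, h: int, w: int) -> torch.Tensor:
        # Map every pair of tokens in a (d, h, w) window to an index into a
        # (2d-1)(2h-1)(2w-1)-entry relative-position bias table.
        coords = torch.stack(torch.meshgrid(
            torch.arange(d), torch.arange(h), torch.arange(w), indexing='ij'))
        coords = coords.flatten(1)                      # (3, N) with N = d*h*w
        rel = coords[:, :, None] - coords[:, None, :]   # pairwise offsets, (3, N, N)
        rel = rel.permute(1, 2, 0).contiguous()         # (N, N, 3)
        rel[:, :, 0] += d - 1                           # shift offsets to start at 0
        rel[:, :, 1] += h - 1
        rel[:, :, 2] += w - 1
        rel[:, :, 0] *= (2 * h - 1) * (2 * w - 1)       # flatten 3D offset to 1D index
        rel[:, :, 1] *= 2 * w - 1
        return rel.sum(-1)                              # (N, N) integer index

    idx = relative_position_index_3d(2, 8, 8)    # torch.Size([128, 128]), max 674
    idx2 = relative_position_index_3d(6, 8, 8)   # torch.Size([384, 384]), max 2474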
+ | -0.000 | -0.666 | 0.553 | 0.116 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc12.weight + | -0.001 | -0.416 | 0.396 | 0.116 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc12.bias + | 0.001 | -0.654 | 0.538 | 0.118 | torch.Size([120, 240]) || stage1.residual_group2.blocks.0.mlp.fc2.weight + | -0.002 | -0.470 | 0.310 | 0.122 | torch.Size([120]) || stage1.residual_group2.blocks.0.mlp.fc2.bias + | 0.951 | 0.342 | 1.189 | 0.111 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.weight + | 0.010 | -0.697 | 0.802 | 0.166 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.bias + | -0.098 | -2.648 | 2.410 | 0.214 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.733 | 0.886 | 0.139 | torch.Size([360, 120]) || stage1.residual_group2.blocks.1.attn.qkv_self.weight + | -0.002 | -0.468 | 0.550 | 0.132 | torch.Size([360]) || stage1.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.435 | 0.377 | 0.096 | torch.Size([120, 120]) || stage1.residual_group2.blocks.1.attn.proj.weight + | -0.001 | -0.359 | 0.258 | 0.114 | torch.Size([120]) || stage1.residual_group2.blocks.1.attn.proj.bias + | 0.582 | 0.305 | 0.717 | 0.055 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.weight + | 0.008 | -0.714 | 0.833 | 0.131 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.bias + | 0.001 | -0.732 | 0.501 | 0.118 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc11.weight + | 0.004 | -0.306 | 0.267 | 0.091 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.510 | 0.533 | 0.126 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc12.weight + | -0.000 | -0.315 | 0.291 | 0.090 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc12.bias + | 0.000 | -0.736 | 0.789 | 0.126 | torch.Size([120, 240]) || stage1.residual_group2.blocks.1.mlp.fc2.weight + | -0.000 | -1.274 | 1.328 | 0.200 | torch.Size([120]) || stage1.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.390 | 0.303 | 0.069 | torch.Size([120, 120]) || stage1.linear2.weight + | 0.010 | -0.219 | 0.227 | 0.087 | torch.Size([120]) || stage1.linear2.bias + | -0.000 | -0.095 | 0.106 | 0.024 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.weight + | -0.001 | -0.036 | 0.036 | 0.013 | torch.Size([120]) || stage1.pa_deform.bias + | -0.000 | -0.136 | 0.141 | 0.017 | torch.Size([120, 242, 3, 3]) || stage1.pa_deform.conv_offset.0.weight + | -0.002 | -0.028 | 0.024 | 0.013 | torch.Size([120]) || stage1.pa_deform.conv_offset.0.bias + | -0.001 | -0.156 | 0.104 | 0.019 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.2.weight + | -0.008 | -0.055 | 0.045 | 0.022 | torch.Size([120]) || stage1.pa_deform.conv_offset.2.bias + | -0.001 | -0.098 | 0.106 | 0.018 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.4.weight + | -0.000 | -0.081 | 0.070 | 0.029 | torch.Size([120]) || stage1.pa_deform.conv_offset.4.bias + | -0.000 | -0.375 | 0.279 | 0.027 | torch.Size([324, 120, 3, 3]) || stage1.pa_deform.conv_offset.6.weight + | -0.003 | -0.074 | 0.070 | 0.028 | torch.Size([324]) || stage1.pa_deform.conv_offset.6.bias + | -0.000 | -0.776 | 0.733 | 0.114 | torch.Size([360, 360]) || stage1.pa_fuse.fc11.weight + | 0.021 | -0.239 | 0.513 | 0.121 | torch.Size([360]) || stage1.pa_fuse.fc11.bias + | 0.001 | -1.100 | 1.143 | 0.149 | torch.Size([360, 360]) || stage1.pa_fuse.fc12.weight + | 0.008 | 
-0.405 | 0.393 | 0.136 | torch.Size([360]) || stage1.pa_fuse.fc12.bias + | 0.000 | -0.963 | 0.899 | 0.142 | torch.Size([120, 360]) || stage1.pa_fuse.fc2.weight + | -0.055 | -0.616 | 0.599 | 0.197 | torch.Size([120]) || stage1.pa_fuse.fc2.bias + | 1.149 | 0.345 | 1.921 | 0.289 | torch.Size([480]) || stage2.reshape.1.weight + | 0.017 | -0.502 | 0.663 | 0.141 | torch.Size([480]) || stage2.reshape.1.bias + | -0.000 | -0.609 | 0.736 | 0.146 | torch.Size([120, 480]) || stage2.reshape.2.weight + | 0.006 | -0.136 | 0.404 | 0.077 | torch.Size([120]) || stage2.reshape.2.bias + | 0.686 | 0.172 | 1.113 | 0.175 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.weight + | -0.154 | -0.926 | 0.339 | 0.217 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.bias + | -0.120 | -1.869 | 4.616 | 0.310 | torch.Size([675, 6]) || stage2.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.514 | 0.499 | 0.102 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_self.weight + | -0.002 | -0.214 | 0.177 | 0.044 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_self.bias + | -0.001 | -0.499 | 0.529 | 0.093 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.attn.proj.weight + | -0.004 | -0.171 | 0.556 | 0.087 | torch.Size([120]) || stage2.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.642 | 0.598 | 0.083 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.141 | 0.125 | 0.027 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.592 | 0.325 | 0.794 | 0.096 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.weight + | 0.008 | -0.649 | 0.445 | 0.168 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.485 | 0.457 | 0.116 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc11.weight + | -0.053 | -0.240 | 0.171 | 0.062 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.503 | 0.462 | 0.118 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc12.weight + | 0.005 | -0.177 | 0.268 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.690 | 0.498 | 0.123 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.mlp.fc2.weight + | -0.007 | -0.270 | 0.472 | 0.097 | torch.Size([120]) || stage2.residual_group1.blocks.0.mlp.fc2.bias + | 0.864 | 0.187 | 1.221 | 0.164 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.weight + | -0.146 | -1.128 | 0.299 | 0.204 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.bias + | -0.241 | -1.607 | 8.958 | 0.356 | torch.Size([675, 6]) || stage2.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.561 | 0.538 | 0.116 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_self.weight + | 0.001 | -0.198 | 0.222 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_self.bias + | 0.001 | -0.475 | 0.479 | 0.099 | torch.Size([120, 240]) || 
stage2.residual_group1.blocks.1.attn.proj.weight + | -0.006 | -0.295 | 0.341 | 0.101 | torch.Size([120]) || stage2.residual_group1.blocks.1.attn.proj.bias + | 0.001 | -0.961 | 0.789 | 0.080 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.001 | -0.105 | 0.143 | 0.024 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.653 | 0.401 | 0.810 | 0.063 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.weight + | 0.009 | -0.767 | 0.367 | 0.154 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.486 | 0.499 | 0.117 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc11.weight + | -0.056 | -0.185 | 0.147 | 0.058 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.548 | 0.121 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc12.weight + | 0.002 | -0.231 | 0.177 | 0.071 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc12.bias + | -0.001 | -0.578 | 0.609 | 0.123 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.mlp.fc2.weight + | -0.003 | -0.350 | 0.216 | 0.098 | torch.Size([120]) || stage2.residual_group1.blocks.1.mlp.fc2.bias + | 0.848 | 0.172 | 1.107 | 0.144 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.weight + | -0.168 | -1.123 | 0.330 | 0.178 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.bias + | -0.074 | -1.239 | 4.293 | 0.247 | torch.Size([675, 6]) || stage2.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.2.attn.position_bias + | -0.001 | -0.643 | 0.531 | 0.117 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_self.weight + | 0.003 | -0.220 | 0.376 | 0.047 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.529 | 0.479 | 0.100 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.230 | 0.295 | 0.074 | torch.Size([120]) || stage2.residual_group1.blocks.2.attn.proj.bias + | -0.001 | -0.726 | 0.768 | 0.091 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.001 | -0.167 | 0.193 | 0.028 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.695 | 0.334 | 0.833 | 0.068 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.weight + | 0.012 | -0.755 | 0.517 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.bias + | 0.001 | -0.474 | 0.480 | 0.119 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc11.weight + | -0.049 | -0.218 | 0.148 | 0.067 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.529 | 0.542 | 0.124 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc12.weight + | -0.006 | -0.245 | 0.239 | 0.073 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc12.bias + | -0.001 | -0.541 | 0.485 | 0.124 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.mlp.fc2.weight + | 0.000 | -0.318 | 0.170 | 0.077 | torch.Size([120]) || stage2.residual_group1.blocks.2.mlp.fc2.bias + | 0.903 | 0.178 | 1.124 | 0.124 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.weight + | -0.138 | -1.223 | 0.440 | 0.177 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.bias + | -0.164 | -1.383 | 5.910 | 
0.305 | torch.Size([675, 6]) || stage2.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.526 | 0.496 | 0.120 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_self.weight + | 0.000 | -0.250 | 0.273 | 0.061 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.447 | 0.524 | 0.097 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.attn.proj.weight + | -0.003 | -0.243 | 0.256 | 0.082 | torch.Size([120]) || stage2.residual_group1.blocks.3.attn.proj.bias + | -0.001 | -0.551 | 0.730 | 0.083 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.001 | -0.145 | 0.126 | 0.024 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.707 | 0.319 | 0.855 | 0.063 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.weight + | 0.013 | -0.839 | 0.507 | 0.155 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.509 | 0.508 | 0.118 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc11.weight + | -0.051 | -0.219 | 0.155 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.475 | 0.592 | 0.124 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc12.weight + | -0.002 | -0.162 | 0.220 | 0.069 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.465 | 0.528 | 0.124 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.mlp.fc2.weight + | -0.002 | -0.243 | 0.286 | 0.088 | torch.Size([120]) || stage2.residual_group1.blocks.3.mlp.fc2.bias + | 0.948 | 0.220 | 1.175 | 0.108 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.weight + | -0.125 | -1.093 | 0.385 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.bias + | -0.150 | -1.632 | 4.522 | 0.341 | torch.Size([675, 6]) || stage2.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.636 | 0.543 | 0.119 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_self.weight + | -0.001 | -0.254 | 0.262 | 0.048 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_self.bias + | 0.001 | -0.632 | 0.628 | 0.112 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.attn.proj.weight + | -0.005 | -0.240 | 0.330 | 0.104 | torch.Size([120]) || stage2.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.476 | 0.479 | 0.088 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.001 | -0.112 | 0.134 | 0.020 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.686 | 0.264 | 0.797 | 0.060 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.weight + | 0.012 | -0.889 | 0.427 | 0.140 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.bias + | 0.001 | -0.476 | 0.478 | 0.117 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc11.weight + | -0.051 | -0.267 | 0.180 | 0.071 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.506 | 0.517 
| 0.127 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc12.weight + | 0.002 | -0.172 | 0.241 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc12.bias + | -0.001 | -0.570 | 0.542 | 0.126 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.mlp.fc2.weight + | -0.003 | -0.631 | 0.395 | 0.123 | torch.Size([120]) || stage2.residual_group1.blocks.4.mlp.fc2.bias + | 0.912 | 0.189 | 1.122 | 0.104 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.weight + | -0.114 | -1.125 | 0.188 | 0.140 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.bias + | -0.099 | -1.285 | 1.708 | 0.236 | torch.Size([675, 6]) || stage2.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.496 | 0.540 | 0.119 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_self.weight + | 0.003 | -0.260 | 0.228 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.511 | 0.454 | 0.095 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.attn.proj.weight + | 0.000 | -0.711 | 0.286 | 0.115 | torch.Size([120]) || stage2.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.444 | 0.454 | 0.082 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.000 | -0.101 | 0.133 | 0.021 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.668 | 0.312 | 0.800 | 0.056 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.weight + | 0.015 | -0.778 | 0.372 | 0.111 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.485 | 0.469 | 0.115 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc11.weight + | -0.045 | -0.294 | 0.173 | 0.083 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc11.bias + | 0.000 | -0.554 | 0.540 | 0.129 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc12.weight + | 0.001 | -0.183 | 0.199 | 0.077 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.879 | 0.824 | 0.127 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.mlp.fc2.weight + | 0.001 | -1.670 | 0.358 | 0.208 | torch.Size([120]) || stage2.residual_group1.blocks.5.mlp.fc2.bias + | 0.001 | -0.253 | 0.346 | 0.068 | torch.Size([120, 120]) || stage2.linear1.weight + | 0.007 | -0.248 | 0.241 | 0.103 | torch.Size([120]) || stage2.linear1.bias + | 1.012 | 0.613 | 1.327 | 0.116 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.weight + | 0.019 | -0.724 | 0.685 | 0.244 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.bias + | 0.003 | -2.959 | 1.705 | 0.151 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.636 | 0.617 | 0.125 | torch.Size([360, 120]) || stage2.residual_group2.blocks.0.attn.qkv_self.weight + | -0.002 | -0.291 | 0.292 | 0.085 | torch.Size([360]) || stage2.residual_group2.blocks.0.attn.qkv_self.bias + | -0.002 | -0.476 | 0.512 | 0.138 | torch.Size([120, 120]) || stage2.residual_group2.blocks.0.attn.proj.weight + | -0.002 | -0.263 | 0.398 | 0.135 | torch.Size([120]) || 
stage2.residual_group2.blocks.0.attn.proj.bias + | 0.677 | 0.521 | 0.840 | 0.063 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.weight + | 0.010 | -0.710 | 0.541 | 0.173 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.bias + | 0.001 | -0.540 | 0.507 | 0.112 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc11.weight + | -0.016 | -0.242 | 0.201 | 0.077 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc11.bias + | 0.000 | -0.519 | 0.479 | 0.122 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc12.weight + | -0.006 | -0.162 | 0.231 | 0.071 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc12.bias + | -0.001 | -0.449 | 0.494 | 0.121 | torch.Size([120, 240]) || stage2.residual_group2.blocks.0.mlp.fc2.weight + | 0.002 | -0.293 | 0.222 | 0.095 | torch.Size([120]) || stage2.residual_group2.blocks.0.mlp.fc2.bias + | 1.053 | 0.832 | 1.269 | 0.079 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.weight + | 0.015 | -0.549 | 0.428 | 0.189 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.bias + | 0.007 | -3.099 | 1.550 | 0.170 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.673 | 0.604 | 0.131 | torch.Size([360, 120]) || stage2.residual_group2.blocks.1.attn.qkv_self.weight + | -0.001 | -0.416 | 0.391 | 0.089 | torch.Size([360]) || stage2.residual_group2.blocks.1.attn.qkv_self.bias + | -0.000 | -0.569 | 0.560 | 0.139 | torch.Size([120, 120]) || stage2.residual_group2.blocks.1.attn.proj.weight + | 0.004 | -0.613 | 0.428 | 0.158 | torch.Size([120]) || stage2.residual_group2.blocks.1.attn.proj.bias + | 0.762 | 0.464 | 0.954 | 0.085 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.weight + | 0.005 | -0.745 | 0.381 | 0.117 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.441 | 0.448 | 0.110 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc11.weight + | 0.019 | -0.292 | 0.460 | 0.117 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.491 | 0.490 | 0.126 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc12.weight + | -0.007 | -0.285 | 0.177 | 0.068 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.535 | 0.631 | 0.125 | torch.Size([120, 240]) || stage2.residual_group2.blocks.1.mlp.fc2.weight + | -0.011 | -0.765 | 0.337 | 0.142 | torch.Size([120]) || stage2.residual_group2.blocks.1.mlp.fc2.bias + | 0.001 | -0.367 | 0.372 | 0.074 | torch.Size([120, 120]) || stage2.linear2.weight + | 0.009 | -0.288 | 0.342 | 0.130 | torch.Size([120]) || stage2.linear2.bias + | 0.000 | -0.112 | 0.093 | 0.022 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.weight + | -0.002 | -0.036 | 0.035 | 0.016 | torch.Size([120]) || stage2.pa_deform.bias + | 0.000 | -0.068 | 0.080 | 0.016 | torch.Size([120, 242, 3, 3]) || stage2.pa_deform.conv_offset.0.weight + | -0.009 | -0.035 | 0.023 | 0.013 | torch.Size([120]) || stage2.pa_deform.conv_offset.0.bias + | 0.000 | -0.068 | 0.079 | 0.019 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.2.weight + | -0.014 | -0.061 | 0.036 | 0.021 | torch.Size([120]) || stage2.pa_deform.conv_offset.2.bias + | -0.001 | -0.082 | 0.079 | 0.019 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.4.weight + | -0.003 | -0.075 | 0.069 | 0.035 | 
torch.Size([120]) || stage2.pa_deform.conv_offset.4.bias + | -0.000 | -0.166 | 0.139 | 0.016 | torch.Size([324, 120, 3, 3]) || stage2.pa_deform.conv_offset.6.weight + | -0.015 | -0.090 | 0.050 | 0.030 | torch.Size([324]) || stage2.pa_deform.conv_offset.6.bias + | -0.002 | -0.642 | 0.663 | 0.127 | torch.Size([360, 360]) || stage2.pa_fuse.fc11.weight + | 0.130 | -0.171 | 0.480 | 0.140 | torch.Size([360]) || stage2.pa_fuse.fc11.bias + | -0.000 | -0.696 | 0.620 | 0.118 | torch.Size([360, 360]) || stage2.pa_fuse.fc12.weight + | -0.007 | -0.337 | 0.301 | 0.102 | torch.Size([360]) || stage2.pa_fuse.fc12.bias + | 0.000 | -0.650 | 0.657 | 0.128 | torch.Size([120, 360]) || stage2.pa_fuse.fc2.weight + | 0.013 | -0.507 | 0.451 | 0.215 | torch.Size([120]) || stage2.pa_fuse.fc2.bias + | 1.067 | 0.372 | 1.778 | 0.269 | torch.Size([480]) || stage3.reshape.1.weight + | -0.004 | -0.699 | 0.521 | 0.227 | torch.Size([480]) || stage3.reshape.1.bias + | -0.000 | -0.643 | 0.743 | 0.138 | torch.Size([120, 480]) || stage3.reshape.2.weight + | 0.009 | -0.176 | 0.243 | 0.079 | torch.Size([120]) || stage3.reshape.2.bias + | 0.785 | 0.469 | 1.029 | 0.105 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.weight + | -0.102 | -0.716 | 0.311 | 0.179 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.bias + | -0.001 | -0.340 | 0.163 | 0.033 | torch.Size([675, 6]) || stage3.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.328 | 0.302 | 0.061 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_self.weight + | 0.004 | -0.232 | 0.189 | 0.063 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.343 | 0.346 | 0.058 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.attn.proj.weight + | 0.004 | -0.335 | 0.229 | 0.102 | torch.Size([120]) || stage3.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.366 | 0.325 | 0.052 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.001 | -0.091 | 0.074 | 0.017 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.751 | 0.517 | 0.928 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.weight + | 0.002 | -0.271 | 0.189 | 0.101 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.371 | 0.388 | 0.096 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc11.weight + | -0.073 | -0.203 | 0.039 | 0.046 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.400 | 0.401 | 0.094 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc12.weight + | -0.000 | -0.178 | 0.128 | 0.052 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc12.bias + | -0.001 | -0.410 | 0.429 | 0.098 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.mlp.fc2.weight + | 0.006 | -0.345 | 0.304 | 0.108 | torch.Size([120]) || stage3.residual_group1.blocks.0.mlp.fc2.bias + | 0.816 | 0.469 | 1.015 | 0.110 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.weight + | -0.103 | -0.647 | 0.225 | 0.140 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.bias + | 0.001 | -0.464 | 0.239 | 0.034 | torch.Size([675, 6]) || stage3.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 
0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.1.attn.position_bias + | -0.000 | -0.304 | 0.359 | 0.061 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_self.weight + | 0.001 | -0.173 | 0.193 | 0.047 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.299 | 0.408 | 0.055 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.attn.proj.weight + | 0.007 | -0.511 | 0.239 | 0.113 | torch.Size([120]) || stage3.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.288 | 0.254 | 0.049 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.001 | -0.060 | 0.054 | 0.016 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.796 | 0.609 | 0.971 | 0.076 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.weight + | -0.002 | -0.327 | 0.247 | 0.122 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.379 | 0.407 | 0.094 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc11.weight + | -0.077 | -0.214 | 0.034 | 0.045 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.391 | 0.432 | 0.092 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc12.weight + | 0.005 | -0.176 | 0.112 | 0.044 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc12.bias + | 0.000 | -0.378 | 0.399 | 0.093 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.mlp.fc2.weight + | 0.009 | -0.410 | 0.306 | 0.110 | torch.Size([120]) || stage3.residual_group1.blocks.1.mlp.fc2.bias + | 0.854 | 0.447 | 0.995 | 0.090 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.weight + | -0.086 | -0.513 | 0.198 | 0.116 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.bias + | -0.001 | -0.189 | 0.292 | 0.033 | torch.Size([675, 6]) || stage3.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.390 | 0.367 | 0.067 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_self.weight + | -0.002 | -0.310 | 0.284 | 0.078 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.334 | 0.296 | 0.061 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.attn.proj.weight + | 0.004 | -0.356 | 0.299 | 0.096 | torch.Size([120]) || stage3.residual_group1.blocks.2.attn.proj.bias + | 0.000 | -0.276 | 0.315 | 0.055 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.000 | -0.094 | 0.066 | 0.014 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.829 | 0.673 | 1.017 | 0.074 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.weight + | 0.003 | -0.259 | 0.228 | 0.098 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.bias + | 0.001 | -0.410 | 0.385 | 0.091 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc11.weight + | -0.085 | -0.200 | 0.017 | 0.044 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.348 | 0.378 | 0.090 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc12.weight + | 0.001 | -0.130 | 0.105 | 0.042 | 
torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.346 | 0.425 | 0.090 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.mlp.fc2.weight + | 0.005 | -0.363 | 0.241 | 0.094 | torch.Size([120]) || stage3.residual_group1.blocks.2.mlp.fc2.bias + | 0.872 | 0.554 | 1.068 | 0.102 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.weight + | -0.057 | -0.402 | 0.133 | 0.087 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.bias + | 0.003 | -0.365 | 0.217 | 0.050 | torch.Size([675, 6]) || stage3.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.359 | 0.357 | 0.065 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_self.weight + | -0.002 | -0.265 | 0.294 | 0.062 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.300 | 0.271 | 0.054 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.attn.proj.weight + | 0.002 | -0.316 | 0.215 | 0.094 | torch.Size([120]) || stage3.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.370 | 0.329 | 0.039 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.000 | -0.056 | 0.066 | 0.013 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.842 | 0.631 | 0.989 | 0.073 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.weight + | -0.001 | -0.216 | 0.263 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.bias + | 0.001 | -0.388 | 0.391 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc11.weight + | -0.087 | -0.202 | 0.032 | 0.048 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.364 | 0.428 | 0.088 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc12.weight + | -0.000 | -0.137 | 0.106 | 0.043 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc12.bias + | -0.001 | -0.390 | 0.339 | 0.088 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.mlp.fc2.weight + | 0.003 | -0.376 | 0.203 | 0.090 | torch.Size([120]) || stage3.residual_group1.blocks.3.mlp.fc2.bias + | 0.913 | 0.498 | 1.102 | 0.096 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.weight + | -0.048 | -0.340 | 0.105 | 0.071 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.bias + | 0.001 | -0.706 | 0.306 | 0.058 | torch.Size([675, 6]) || stage3.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.4.attn.position_bias + | 0.000 | -0.373 | 0.339 | 0.076 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_self.weight + | -0.004 | -0.301 | 0.301 | 0.074 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.278 | 0.277 | 0.058 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.attn.proj.weight + | 0.003 | -0.310 | 0.240 | 0.079 | torch.Size([120]) || stage3.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.350 | 0.322 | 0.046 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.000 | -0.045 | 0.064 | 0.010 | 
torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.862 | 0.679 | 0.990 | 0.059 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.weight
+ | -0.004 | -0.313 | 0.190 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.bias
+ | 0.001 | -0.370 | 0.364 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc11.weight
+ | -0.092 | -0.231 | 0.129 | 0.057 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc11.bias
+ | -0.000 | -0.375 | 0.511 | 0.090 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.002 | -0.114 | 0.114 | 0.040 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc12.bias
+ | -0.000 | -0.389 | 0.354 | 0.088 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.mlp.fc2.weight
+ | 0.005 | -0.258 | 0.164 | 0.073 | torch.Size([120]) || stage3.residual_group1.blocks.4.mlp.fc2.bias
+ | 0.899 | 0.480 | 1.089 | 0.103 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.weight
+ | -0.030 | -0.257 | 0.115 | 0.056 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.bias
+ | 0.003 | -0.462 | 0.290 | 0.069 | torch.Size([675, 6]) || stage3.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.5.attn.position_bias
+ | 0.000 | -0.391 | 0.365 | 0.069 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_self.weight
+ | -0.004 | -0.232 | 0.302 | 0.064 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_self.bias
+ | -0.000 | -0.267 | 0.293 | 0.051 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.attn.proj.weight
+ | 0.000 | -0.250 | 0.182 | 0.070 | torch.Size([120]) || stage3.residual_group1.blocks.5.attn.proj.bias
+ | -0.000 | -0.238 | 0.257 | 0.033 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_mut.weight
+ | -0.001 | -0.032 | 0.033 | 0.008 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.864 | 0.651 | 1.029 | 0.070 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.weight
+ | -0.003 | -0.212 | 0.175 | 0.075 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.bias
+ | 0.000 | -0.378 | 0.379 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc11.weight
+ | -0.097 | -0.308 | 0.026 | 0.051 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc11.bias
+ | 0.000 | -0.578 | 0.401 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc12.weight
+ | -0.005 | -0.166 | 0.131 | 0.049 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc12.bias
+ | 0.000 | -0.358 | 0.376 | 0.085 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.mlp.fc2.weight
+ | 0.001 | -0.262 | 0.176 | 0.072 | torch.Size([120]) || stage3.residual_group1.blocks.5.mlp.fc2.bias
+ | 0.003 | -0.284 | 0.467 | 0.071 | torch.Size([120, 120]) || stage3.linear1.weight
+ | 0.006 | -0.201 | 0.269 | 0.090 | torch.Size([120]) || stage3.linear1.bias
+ | 0.877 | 0.568 | 1.197 | 0.115 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.weight
+ | 0.002 | -0.248 | 0.324 | 0.100 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.bias
+ | 0.000 | -0.261 | 0.125 | 0.029 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.563 | 0.552 | 0.074 | torch.Size([360, 120]) || stage3.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.005 | -0.257 | 0.302 | 0.081 | torch.Size([360]) || stage3.residual_group2.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.390 | 0.385 | 0.084 | torch.Size([120, 120]) || stage3.residual_group2.blocks.0.attn.proj.weight
+ | 0.002 | -0.450 | 0.235 | 0.125 | torch.Size([120]) || stage3.residual_group2.blocks.0.attn.proj.bias
+ | 0.986 | 0.755 | 1.165 | 0.078 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.weight
+ | -0.000 | -0.260 | 0.169 | 0.076 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.bias
+ | 0.000 | -0.355 | 0.397 | 0.087 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.046 | -0.220 | 0.086 | 0.055 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.424 | 0.368 | 0.089 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.006 | -0.111 | 0.122 | 0.038 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.354 | 0.374 | 0.090 | torch.Size([120, 240]) || stage3.residual_group2.blocks.0.mlp.fc2.weight
+ | 0.001 | -0.374 | 0.272 | 0.101 | torch.Size([120]) || stage3.residual_group2.blocks.0.mlp.fc2.bias
+ | 0.919 | 0.643 | 1.132 | 0.100 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.weight
+ | 0.000 | -0.177 | 0.181 | 0.063 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.bias
+ | 0.000 | -0.332 | 0.131 | 0.028 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.1.attn.relative_position_index
+ | -0.000 | -0.418 | 0.362 | 0.069 | torch.Size([360, 120]) || stage3.residual_group2.blocks.1.attn.qkv_self.weight
+ | -0.004 | -0.375 | 0.347 | 0.082 | torch.Size([360]) || stage3.residual_group2.blocks.1.attn.qkv_self.bias
+ | -0.001 | -0.294 | 0.354 | 0.077 | torch.Size([120, 120]) || stage3.residual_group2.blocks.1.attn.proj.weight
+ | 0.003 | -0.432 | 0.259 | 0.101 | torch.Size([120]) || stage3.residual_group2.blocks.1.attn.proj.bias
+ | 1.012 | 0.750 | 1.178 | 0.077 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.weight
+ | -0.001 | -0.171 | 0.155 | 0.060 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.bias
+ | 0.000 | -0.331 | 0.356 | 0.087 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.035 | -0.207 | 0.197 | 0.065 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.399 | 0.398 | 0.092 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc12.weight
+ | -0.002 | -0.111 | 0.129 | 0.041 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.001 | -0.353 | 0.330 | 0.088 | torch.Size([120, 240]) || stage3.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.001 | -0.328 | 0.127 | 0.064 | torch.Size([120]) || stage3.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.003 | -0.289 | 0.519 | 0.073 | torch.Size([120, 120]) || stage3.linear2.weight
+ | 0.002 | -0.318 | 0.371 | 0.144 | torch.Size([120]) || stage3.linear2.bias
+ | -0.000 | -0.086 | 0.095 | 0.022 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.weight
+ | -0.002 | -0.023 | 0.021 | 0.010 | torch.Size([120]) || stage3.pa_deform.bias
+ | -0.000 | -0.060 | 0.056 | 0.015 | torch.Size([120, 242, 3, 3]) || stage3.pa_deform.conv_offset.0.weight
+ | -0.008 | -0.035 | 0.019 | 0.013 | torch.Size([120]) || stage3.pa_deform.conv_offset.0.bias
+ | -0.001 | -0.064 | 0.062 | 0.019 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.2.weight
+ | -0.007 | -0.044 | 0.031 | 0.019 | torch.Size([120]) || stage3.pa_deform.conv_offset.2.bias
+ | 0.000 | -0.062 | 0.063 | 0.019 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.4.weight
+ | -0.006 | -0.052 | 0.043 | 0.021 | torch.Size([120]) || stage3.pa_deform.conv_offset.4.bias
+ | 0.000 | -0.081 | 0.080 | 0.011 | torch.Size([324, 120, 3, 3]) || stage3.pa_deform.conv_offset.6.weight
+ | -0.004 | -0.087 | 0.083 | 0.021 | torch.Size([324]) || stage3.pa_deform.conv_offset.6.bias
+ | -0.002 | -0.465 | 0.513 | 0.101 | torch.Size([360, 360]) || stage3.pa_fuse.fc11.weight
+ | 0.059 | -0.251 | 0.595 | 0.104 | torch.Size([360]) || stage3.pa_fuse.fc11.bias
+ | -0.000 | -0.544 | 0.531 | 0.100 | torch.Size([360, 360]) || stage3.pa_fuse.fc12.weight
+ | 0.001 | -0.589 | 0.433 | 0.106 | torch.Size([360]) || stage3.pa_fuse.fc12.bias
+ | -0.000 | -0.535 | 0.562 | 0.127 | torch.Size([120, 360]) || stage3.pa_fuse.fc2.weight
+ | -0.001 | -0.401 | 0.342 | 0.121 | torch.Size([120]) || stage3.pa_fuse.fc2.bias
+ | 0.997 | 0.921 | 1.125 | 0.028 | torch.Size([480]) || stage4.reshape.1.weight
+ | -0.000 | -0.058 | 0.059 | 0.022 | torch.Size([480]) || stage4.reshape.1.bias
+ | 0.000 | -0.155 | 0.150 | 0.031 | torch.Size([120, 480]) || stage4.reshape.2.weight
+ | 0.001 | -0.016 | 0.016 | 0.006 | torch.Size([120]) || stage4.reshape.2.bias
+ | 1.002 | 0.999 | 1.009 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.weight
+ | 0.000 | -0.002 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.bias
+ | -0.000 | -0.071 | 0.066 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.0.attn.position_bias
+ | 0.000 | -0.093 | 0.081 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.000 | -0.009 | 0.009 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.080 | 0.097 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.attn.proj.weight
+ | 0.000 | -0.035 | 0.027 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.0.attn.proj.bias
+ | 0.000 | -0.080 | 0.079 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_mut.weight
+ | -0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.bias
+ | -0.000 | -0.079 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc11.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.087 | 0.092 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc12.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.080 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.031 | 0.029 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.0.mlp.fc2.bias
+ | 1.002 | 0.997 | 1.007 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.weight
+ | -0.000 | -0.002 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.bias
+ | 0.000 | -0.066 | 0.065 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.1.attn.position_bias
+ | -0.000 | -0.078 | 0.081 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.006 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.080 | 0.083 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.attn.proj.weight
+ | -0.000 | -0.027 | 0.029 | 0.012 | torch.Size([120]) || stage4.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.077 | 0.082 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_mut.weight
+ | -0.000 | -0.006 | 0.009 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.weight
+ | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.bias
+ | -0.000 | -0.080 | 0.078 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.077 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.084 | 0.075 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.000 | -0.034 | 0.031 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.1.mlp.fc2.bias
+ | 1.002 | 0.996 | 1.008 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.weight
+ | -0.000 | -0.003 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.bias
+ | 0.001 | -0.070 | 0.071 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.2.attn.position_bias
+ | 0.000 | -0.091 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_self.weight
+ | -0.000 | -0.007 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_self.bias
+ | 0.000 | -0.080 | 0.084 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.attn.proj.weight
+ | -0.000 | -0.023 | 0.026 | 0.010 | torch.Size([120]) || stage4.residual_group1.blocks.2.attn.proj.bias
+ | -0.000 | -0.107 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_mut.weight
+ | 0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || 
stage4.residual_group1.blocks.2.norm2.weight + | 0.000 | -0.000 | 0.001 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.076 | 0.077 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc11.weight + | -0.000 | -0.005 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -2.000 | 0.081 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc12.weight + | 0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc12.bias + | -0.000 | -0.084 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.mlp.fc2.weight + | 0.000 | -0.027 | 0.024 | 0.010 | torch.Size([120]) || stage4.residual_group1.blocks.2.mlp.fc2.bias + | 1.002 | 0.999 | 1.012 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.weight + | -0.000 | -0.003 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.bias + | 0.000 | -0.064 | 0.071 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.099 | 0.088 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_self.weight + | 0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.083 | 0.084 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.attn.proj.weight + | -0.000 | -0.019 | 0.018 | 0.008 | torch.Size([120]) || stage4.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.079 | 0.084 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.000 | -0.004 | 0.004 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_mut.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.weight + | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.bias + | -0.000 | -0.078 | 0.081 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc11.weight + | -0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.087 | 0.076 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc12.weight + | -0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc12.bias + | -0.000 | -0.079 | 0.082 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.mlp.fc2.weight + | 0.000 | -0.022 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group1.blocks.3.mlp.fc2.bias + | 1.002 | 0.998 | 1.011 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.weight + | -0.001 | -0.004 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.bias + | 0.000 | -0.089 | 0.081 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.080 | 0.085 | 0.020 | torch.Size([360, 120]) || 
stage4.residual_group1.blocks.4.attn.qkv_self.weight + | -0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_self.bias + | -0.000 | -0.075 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.attn.proj.weight + | -0.000 | -0.021 | 0.016 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.082 | 0.088 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.000 | -0.004 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_mut.bias + | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.weight + | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.086 | 0.080 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc11.weight + | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.084 | 0.083 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc12.weight + | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.076 | 0.081 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.mlp.fc2.weight + | -0.000 | -0.018 | 0.015 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.4.mlp.fc2.bias + | 1.003 | 0.997 | 1.014 | 0.003 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.weight + | -0.001 | -0.005 | 0.004 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.bias + | -0.001 | -0.070 | 0.069 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.097 | 0.082 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_self.weight + | 0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.075 | 0.089 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.attn.proj.weight + | 0.000 | -0.016 | 0.015 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.083 | 0.091 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.000 | -0.006 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_mut.bias + | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.weight + | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.bias + | 0.000 | -0.093 | 0.083 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc11.weight + | 0.000 | -0.002 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc11.bias + | 0.000 | -0.086 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc12.weight + | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.079 | 0.092 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.mlp.fc2.weight + | -0.000 | -0.012 | 0.016 | 0.005 | torch.Size([120]) || stage4.residual_group1.blocks.5.mlp.fc2.bias + | 
-0.000 | -0.090 | 0.111 | 0.024 | torch.Size([120, 120]) || stage4.linear1.weight + | 0.001 | -0.019 | 0.029 | 0.009 | torch.Size([120]) || stage4.linear1.bias + | 1.000 | 0.999 | 1.003 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.weight + | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.bias + | -0.000 | -0.078 | 0.075 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.084 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group2.blocks.0.attn.qkv_self.weight + | 0.000 | -0.005 | 0.004 | 0.001 | torch.Size([360]) || stage4.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.079 | 0.080 | 0.020 | torch.Size([120, 120]) || stage4.residual_group2.blocks.0.attn.proj.weight + | 0.000 | -0.021 | 0.024 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.0.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.weight + | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.bias + | -0.000 | -0.079 | 0.072 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc11.weight + | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc11.bias + | 0.000 | -0.077 | 0.078 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc12.weight + | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc12.bias + | -0.000 | -0.102 | 0.078 | 0.020 | torch.Size([120, 240]) || stage4.residual_group2.blocks.0.mlp.fc2.weight + | 0.000 | -0.024 | 0.020 | 0.009 | torch.Size([120]) || stage4.residual_group2.blocks.0.mlp.fc2.bias + | 1.001 | 0.998 | 1.003 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.weight + | -0.000 | -0.002 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.071 | 0.079 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.078 | 0.096 | 0.020 | torch.Size([360, 120]) || stage4.residual_group2.blocks.1.attn.qkv_self.weight + | 0.000 | -0.005 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.077 | 0.080 | 0.020 | torch.Size([120, 120]) || stage4.residual_group2.blocks.1.attn.proj.weight + | 0.000 | -0.020 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.1.attn.proj.bias + | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.weight + | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.bias + | -0.000 | -0.085 | 0.082 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc11.weight + | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.083 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc12.weight + | 0.000 | -0.001 | 0.000 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.078 | 0.078 | 0.020 | torch.Size([120, 240]) || 
stage4.residual_group2.blocks.1.mlp.fc2.weight + | 0.000 | -0.022 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.1.mlp.fc2.bias + | 0.000 | -0.092 | 0.112 | 0.023 | torch.Size([120, 120]) || stage4.linear2.weight + | 0.000 | -0.032 | 0.049 | 0.015 | torch.Size([120]) || stage4.linear2.bias + | 0.000 | -0.036 | 0.037 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.weight + | 0.000 | -0.005 | 0.005 | 0.002 | torch.Size([120]) || stage4.pa_deform.bias + | -0.000 | -0.021 | 0.022 | 0.012 | torch.Size([120, 242, 3, 3]) || stage4.pa_deform.conv_offset.0.weight + | -0.001 | -0.021 | 0.021 | 0.012 | torch.Size([120]) || stage4.pa_deform.conv_offset.0.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.2.weight + | 0.002 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage4.pa_deform.conv_offset.2.bias + | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.4.weight + | -0.002 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage4.pa_deform.conv_offset.4.bias + | 0.000 | -0.003 | 0.002 | 0.000 | torch.Size([324, 120, 3, 3]) || stage4.pa_deform.conv_offset.6.weight + | 0.000 | -0.005 | 0.004 | 0.001 | torch.Size([324]) || stage4.pa_deform.conv_offset.6.bias + | 0.000 | -0.172 | 0.177 | 0.022 | torch.Size([360, 360]) || stage4.pa_fuse.fc11.weight + | 0.002 | -0.027 | 0.088 | 0.014 | torch.Size([360]) || stage4.pa_fuse.fc11.bias + | 0.000 | -0.212 | 0.163 | 0.022 | torch.Size([360, 360]) || stage4.pa_fuse.fc12.weight + | 0.000 | -0.066 | 0.081 | 0.014 | torch.Size([360]) || stage4.pa_fuse.fc12.bias + | 0.000 | -0.413 | 0.387 | 0.029 | torch.Size([120, 360]) || stage4.pa_fuse.fc2.weight + | -0.001 | -0.198 | 0.214 | 0.073 | torch.Size([120]) || stage4.pa_fuse.fc2.bias + | 0.979 | 0.896 | 1.076 | 0.053 | torch.Size([30]) || stage5.reshape.1.weight + | -0.005 | -0.074 | 0.100 | 0.043 | torch.Size([30]) || stage5.reshape.1.bias + | 0.000 | -0.240 | 0.249 | 0.058 | torch.Size([120, 30]) || stage5.reshape.2.weight + | -0.002 | -0.286 | 0.229 | 0.080 | torch.Size([120]) || stage5.reshape.2.bias + | 1.001 | 0.993 | 1.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.weight + | -0.004 | -0.018 | 0.006 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.066 | 0.062 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.091 | 0.086 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_self.weight + | -0.000 | -0.014 | 0.012 | 0.004 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_self.bias + | -0.000 | -0.166 | 0.172 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.attn.proj.weight + | -0.001 | -0.053 | 0.045 | 0.018 | torch.Size([120]) || stage5.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.090 | 0.081 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.999 | 0.987 | 1.001 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.weight + | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([120]) || 
stage5.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.094 | 0.079 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc11.weight + | 0.000 | -0.022 | 0.012 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.082 | 0.083 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc12.weight + | 0.000 | -0.013 | 0.014 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.075 | 0.083 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.mlp.fc2.weight + | 0.000 | -0.073 | 0.078 | 0.021 | torch.Size([120]) || stage5.residual_group1.blocks.0.mlp.fc2.bias + | 1.001 | 0.994 | 1.007 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.weight + | -0.004 | -0.016 | 0.004 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.bias + | 0.000 | -0.065 | 0.063 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.1.attn.position_bias + | -0.000 | -0.077 | 0.083 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_self.weight + | 0.000 | -0.022 | 0.017 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.113 | 0.098 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.attn.proj.weight + | 0.000 | -0.058 | 0.045 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.080 | 0.080 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.000 | -0.008 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.999 | 0.982 | 1.001 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.weight + | 0.000 | -0.006 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.bias + | -0.000 | -0.076 | 0.083 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc11.weight + | 0.000 | -0.017 | 0.014 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.080 | 0.086 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc12.weight + | -0.000 | -0.014 | 0.016 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.096 | 0.079 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.mlp.fc2.weight + | 0.001 | -0.051 | 0.039 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.1.mlp.fc2.bias + | 1.002 | 0.998 | 1.009 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.weight + | -0.004 | -0.014 | 0.003 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.bias + | 0.000 | -0.067 | 0.073 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.085 | 0.087 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_self.weight + | 0.000 | -0.015 | 0.014 | 0.003 | torch.Size([360]) || 
stage5.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.108 | 0.095 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.attn.proj.weight + | -0.001 | -0.043 | 0.039 | 0.013 | torch.Size([120]) || stage5.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.088 | 0.081 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.000 | -0.009 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.999 | 0.978 | 1.001 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.weight + | 0.000 | -0.003 | 0.004 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.076 | 0.081 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc11.weight + | -0.000 | -0.012 | 0.019 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.079 | 0.077 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc12.weight + | -0.001 | -0.014 | 0.012 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.076 | 0.082 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.mlp.fc2.weight + | -0.000 | -0.047 | 0.043 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.2.mlp.fc2.bias + | 1.002 | 0.978 | 1.015 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.weight + | -0.004 | -0.013 | 0.004 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.bias + | -0.000 | -0.084 | 0.070 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.078 | 0.082 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_self.weight + | -0.000 | -0.014 | 0.014 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.123 | 0.132 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.attn.proj.weight + | 0.001 | -0.028 | 0.044 | 0.015 | torch.Size([120]) || stage5.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.082 | 0.089 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.999 | 0.974 | 1.001 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.weight + | 0.000 | -0.008 | 0.010 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.075 | 0.088 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc11.weight + | 0.000 | -0.014 | 0.019 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.081 | 0.080 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc12.weight + | 0.000 | -0.031 | 0.020 | 0.006 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.081 | 0.106 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.mlp.fc2.weight + | -0.002 | -0.046 | 0.042 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.3.mlp.fc2.bias + | 1.003 | 0.944 | 1.017 | 0.009 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.weight + | -0.005 | 
-0.015 | 0.004 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.071 | 0.067 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.085 | 0.090 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_self.weight + | -0.000 | -0.021 | 0.013 | 0.004 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_self.bias + | 0.000 | -0.130 | 0.089 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.attn.proj.weight + | -0.001 | -0.036 | 0.024 | 0.011 | torch.Size([120]) || stage5.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.086 | 0.076 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.000 | -0.008 | 0.008 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.999 | 0.967 | 1.001 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.weight + | 0.000 | -0.006 | 0.007 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.bias + | 0.000 | -0.080 | 0.085 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc11.weight + | -0.001 | -0.015 | 0.010 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.081 | 0.077 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc12.weight + | -0.000 | -0.020 | 0.018 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc12.bias + | 0.000 | -0.081 | 0.085 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.mlp.fc2.weight + | -0.001 | -0.037 | 0.050 | 0.014 | torch.Size([120]) || stage5.residual_group1.blocks.4.mlp.fc2.bias + | 1.004 | 0.976 | 1.039 | 0.008 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.weight + | -0.005 | -0.015 | 0.005 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.bias + | -0.000 | -0.070 | 0.076 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.099 | 0.097 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_self.weight + | -0.000 | -0.011 | 0.012 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.084 | 0.093 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.attn.proj.weight + | 0.000 | -0.038 | 0.035 | 0.012 | torch.Size([120]) || stage5.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.087 | 0.082 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.000 | -0.008 | 0.010 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.998 | 0.960 | 1.002 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.weight + | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.088 | 0.095 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc11.weight + | -0.000 | 
-0.014 | 0.027 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.081 | 0.074 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc12.weight + | 0.000 | -0.013 | 0.025 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -0.100 | 0.086 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.mlp.fc2.weight + | 0.000 | -0.022 | 0.030 | 0.011 | torch.Size([120]) || stage5.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.102 | 0.117 | 0.023 | torch.Size([120, 120]) || stage5.linear1.weight + | -0.003 | -0.297 | 0.242 | 0.084 | torch.Size([120]) || stage5.linear1.bias + | 0.999 | 0.971 | 1.008 | 0.005 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.weight + | -0.000 | -0.035 | 0.034 | 0.011 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.bias + | 0.000 | -0.079 | 0.074 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.087 | 0.083 | 0.020 | torch.Size([360, 120]) || stage5.residual_group2.blocks.0.attn.qkv_self.weight + | -0.000 | -0.028 | 0.018 | 0.005 | torch.Size([360]) || stage5.residual_group2.blocks.0.attn.qkv_self.bias + | 0.000 | -0.079 | 0.082 | 0.021 | torch.Size([120, 120]) || stage5.residual_group2.blocks.0.attn.proj.weight + | -0.001 | -0.146 | 0.171 | 0.054 | torch.Size([120]) || stage5.residual_group2.blocks.0.attn.proj.bias + | 0.997 | 0.967 | 1.003 | 0.006 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.weight + | 0.000 | -0.005 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.bias + | -0.000 | -0.073 | 0.089 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc11.weight + | -0.002 | -0.017 | 0.008 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.084 | 0.073 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc12.weight + | 0.000 | -0.013 | 0.011 | 0.003 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.083 | 0.085 | 0.020 | torch.Size([120, 240]) || stage5.residual_group2.blocks.0.mlp.fc2.weight + | 0.000 | -0.103 | 0.140 | 0.037 | torch.Size([120]) || stage5.residual_group2.blocks.0.mlp.fc2.bias + | 0.999 | 0.986 | 1.010 | 0.004 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.weight + | 0.000 | -0.035 | 0.034 | 0.010 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.bias + | 0.000 | -0.087 | 0.074 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.084 | 0.079 | 0.020 | torch.Size([360, 120]) || stage5.residual_group2.blocks.1.attn.qkv_self.weight + | 0.000 | -0.024 | 0.024 | 0.005 | torch.Size([360]) || stage5.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.077 | 0.078 | 0.021 | torch.Size([120, 120]) || stage5.residual_group2.blocks.1.attn.proj.weight + | -0.001 | -0.112 | 0.144 | 0.038 | torch.Size([120]) || stage5.residual_group2.blocks.1.attn.proj.bias + | 0.998 | 0.965 | 1.004 | 0.006 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.weight + | 0.000 | -0.004 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.bias + 
| 0.000 | -0.088 | 0.079 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc11.weight + | -0.001 | -0.012 | 0.015 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.102 | 0.080 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc12.weight + | 0.000 | -0.012 | 0.009 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc12.bias + | 0.000 | -0.075 | 0.078 | 0.020 | torch.Size([120, 240]) || stage5.residual_group2.blocks.1.mlp.fc2.weight + | 0.000 | -0.105 | 0.131 | 0.042 | torch.Size([120]) || stage5.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.220 | 0.209 | 0.035 | torch.Size([120, 120]) || stage5.linear2.weight + | -0.003 | -0.335 | 0.284 | 0.096 | torch.Size([120]) || stage5.linear2.bias + | -0.000 | -0.064 | 0.065 | 0.019 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.weight + | 0.001 | -0.050 | 0.050 | 0.029 | torch.Size([120]) || stage5.pa_deform.bias + | 0.000 | -0.119 | 0.106 | 0.013 | torch.Size([120, 242, 3, 3]) || stage5.pa_deform.conv_offset.0.weight + | -0.006 | -0.030 | 0.026 | 0.014 | torch.Size([120]) || stage5.pa_deform.conv_offset.0.bias + | -0.001 | -0.055 | 0.050 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.2.weight + | 0.001 | -0.033 | 0.031 | 0.018 | torch.Size([120]) || stage5.pa_deform.conv_offset.2.bias + | 0.001 | -0.060 | 0.050 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.4.weight + | -0.005 | -0.040 | 0.037 | 0.019 | torch.Size([120]) || stage5.pa_deform.conv_offset.4.bias + | 0.001 | -0.038 | 0.051 | 0.006 | torch.Size([324, 120, 3, 3]) || stage5.pa_deform.conv_offset.6.weight + | 0.000 | -0.048 | 0.050 | 0.017 | torch.Size([324]) || stage5.pa_deform.conv_offset.6.bias + | 0.000 | -0.334 | 0.340 | 0.036 | torch.Size([360, 360]) || stage5.pa_fuse.fc11.weight + | 0.037 | -0.050 | 0.294 | 0.064 | torch.Size([360]) || stage5.pa_fuse.fc11.bias + | -0.000 | -0.343 | 0.349 | 0.036 | torch.Size([360, 360]) || stage5.pa_fuse.fc12.weight + | -0.001 | -0.237 | 0.244 | 0.049 | torch.Size([360]) || stage5.pa_fuse.fc12.bias + | -0.000 | -0.575 | 0.591 | 0.060 | torch.Size([120, 360]) || stage5.pa_fuse.fc2.weight + | -0.001 | -0.404 | 0.344 | 0.122 | torch.Size([120]) || stage5.pa_fuse.fc2.bias + | 1.254 | 1.058 | 1.466 | 0.126 | torch.Size([30]) || stage6.reshape.1.weight + | -0.001 | -0.074 | 0.093 | 0.041 | torch.Size([30]) || stage6.reshape.1.bias + | 0.000 | -0.734 | 0.625 | 0.177 | torch.Size([120, 30]) || stage6.reshape.2.weight + | 0.003 | -0.269 | 0.341 | 0.108 | torch.Size([120]) || stage6.reshape.2.bias + | 0.815 | 0.495 | 1.118 | 0.121 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.weight + | -0.071 | -0.291 | 0.263 | 0.101 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.bias + | -0.000 | -0.080 | 0.087 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.136 | 0.134 | 0.026 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_self.weight + | -0.000 | -0.061 | 0.037 | 0.014 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_self.bias + | -0.000 | -0.201 | 0.182 | 0.032 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.attn.proj.weight + 
| 0.000 | -0.223 | 0.189 | 0.090 | torch.Size([120]) || stage6.residual_group1.blocks.0.attn.proj.bias + | 0.000 | -0.184 | 0.211 | 0.029 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.000 | -0.049 | 0.069 | 0.011 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.710 | 0.556 | 0.893 | 0.072 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.weight + | -0.003 | -0.172 | 0.193 | 0.070 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.217 | 0.211 | 0.033 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc11.weight + | -0.041 | -0.158 | 0.025 | 0.036 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.209 | 0.178 | 0.031 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc12.weight + | -0.000 | -0.141 | 0.186 | 0.031 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc12.bias + | 0.000 | -0.245 | 0.347 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.mlp.fc2.weight + | 0.005 | -0.161 | 0.188 | 0.079 | torch.Size([120]) || stage6.residual_group1.blocks.0.mlp.fc2.bias + | 0.780 | 0.582 | 0.963 | 0.088 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.weight + | -0.112 | -0.302 | 0.103 | 0.085 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.bias + | 0.000 | -0.101 | 0.072 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.112 | 0.178 | 0.026 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_self.weight + | -0.000 | -0.034 | 0.049 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.223 | 0.242 | 0.033 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.attn.proj.weight + | -0.003 | -0.149 | 0.105 | 0.047 | torch.Size([120]) || stage6.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.199 | 0.173 | 0.031 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.000 | -0.035 | 0.056 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.744 | 0.530 | 0.917 | 0.066 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.weight + | 0.004 | -0.131 | 0.180 | 0.059 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.243 | 0.294 | 0.036 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc11.weight + | -0.039 | -0.217 | 0.045 | 0.037 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.206 | 0.178 | 0.033 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc12.weight + | -0.000 | -0.129 | 0.125 | 0.028 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.236 | 0.276 | 0.040 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.mlp.fc2.weight + | 0.000 | -0.158 | 0.170 | 0.063 | torch.Size([120]) || stage6.residual_group1.blocks.1.mlp.fc2.bias + | 0.829 | 0.586 | 1.007 | 0.078 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.weight + | -0.101 | -0.353 | 0.132 | 0.092 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.bias + | -0.000 | -0.082 | 0.076 | 0.021 | torch.Size([675, 6]) || 
stage6.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.154 | 0.143 | 0.032 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_self.weight + | 0.000 | -0.041 | 0.038 | 0.012 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.187 | 0.202 | 0.035 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.096 | 0.127 | 0.041 | torch.Size([120]) || stage6.residual_group1.blocks.2.attn.proj.bias + | -0.000 | -0.203 | 0.185 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.000 | -0.045 | 0.049 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.768 | 0.491 | 0.904 | 0.069 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.weight + | 0.001 | -0.146 | 0.159 | 0.062 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.bias + | -0.000 | -0.184 | 0.204 | 0.037 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc11.weight + | -0.043 | -0.185 | 0.020 | 0.035 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.188 | 0.270 | 0.035 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc12.weight + | 0.000 | -0.152 | 0.134 | 0.031 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc12.bias + | -0.000 | -0.222 | 0.217 | 0.042 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.mlp.fc2.weight + | 0.002 | -0.141 | 0.144 | 0.058 | torch.Size([120]) || stage6.residual_group1.blocks.2.mlp.fc2.bias + | 0.820 | 0.554 | 0.976 | 0.065 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.weight + | -0.091 | -0.336 | 0.137 | 0.087 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.bias + | 0.000 | -0.124 | 0.222 | 0.023 | torch.Size([675, 6]) || stage6.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.157 | 0.175 | 0.036 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_self.weight + | -0.001 | -0.049 | 0.049 | 0.014 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.238 | 0.236 | 0.036 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.attn.proj.weight + | -0.003 | -0.077 | 0.074 | 0.031 | torch.Size([120]) || stage6.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.212 | 0.265 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.000 | -0.028 | 0.052 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.768 | 0.530 | 0.903 | 0.080 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.weight + | 0.002 | -0.104 | 0.157 | 0.044 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.bias + | -0.000 | -0.197 | 0.220 | 0.039 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc11.weight + | -0.042 | -0.155 | 0.043 | 0.039 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc11.bias + | 0.000 | -0.166 | 0.199 | 0.036 | torch.Size([240, 120]) || 
stage6.residual_group1.blocks.3.mlp.fc12.weight + | 0.001 | -0.102 | 0.138 | 0.040 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.241 | 0.256 | 0.044 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.mlp.fc2.weight + | 0.003 | -0.123 | 0.115 | 0.046 | torch.Size([120]) || stage6.residual_group1.blocks.3.mlp.fc2.bias + | 0.817 | 0.631 | 0.918 | 0.055 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.weight + | -0.082 | -0.295 | 0.141 | 0.074 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.bias + | -0.000 | -0.084 | 0.205 | 0.024 | torch.Size([675, 6]) || stage6.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.174 | 0.199 | 0.040 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_self.weight + | -0.000 | -0.060 | 0.081 | 0.017 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_self.bias + | -0.000 | -0.194 | 0.191 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.attn.proj.weight + | 0.001 | -0.083 | 0.077 | 0.035 | torch.Size([120]) || stage6.residual_group1.blocks.4.attn.proj.bias + | -0.000 | -0.218 | 0.243 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.000 | -0.031 | 0.024 | 0.007 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.744 | 0.478 | 0.913 | 0.082 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.weight + | -0.003 | -0.146 | 0.110 | 0.053 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.223 | 0.238 | 0.042 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc11.weight + | -0.046 | -0.200 | 0.071 | 0.051 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc11.bias + | -0.000 | -0.168 | 0.201 | 0.039 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc12.weight + | 0.002 | -0.128 | 0.141 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -0.220 | 0.205 | 0.047 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.mlp.fc2.weight + | 0.001 | -0.086 | 0.094 | 0.034 | torch.Size([120]) || stage6.residual_group1.blocks.4.mlp.fc2.bias + | 0.754 | 0.353 | 0.933 | 0.056 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.weight + | -0.058 | -0.246 | 0.105 | 0.060 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.bias + | -0.000 | -0.113 | 0.536 | 0.030 | torch.Size([675, 6]) || stage6.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.5.attn.position_bias + | 0.000 | -0.261 | 0.224 | 0.044 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_self.weight + | 0.002 | -0.050 | 0.067 | 0.018 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_self.bias + | 0.000 | -0.234 | 0.256 | 0.038 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.attn.proj.weight + | 0.002 | -0.079 | 0.076 | 0.036 | torch.Size([120]) || stage6.residual_group1.blocks.5.attn.proj.bias + | -0.000 | -0.211 | 0.231 | 0.029 | torch.Size([360, 120]) || 
stage6.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.000 | -0.033 | 0.030 | 0.008 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.677 | 0.275 | 0.833 | 0.083 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.weight + | 0.001 | -0.224 | 0.306 | 0.102 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.196 | 0.211 | 0.045 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc11.weight + | -0.061 | -0.289 | 0.136 | 0.089 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.271 | 0.312 | 0.048 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc12.weight + | 0.003 | -0.166 | 0.155 | 0.075 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.286 | 0.375 | 0.054 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.mlp.fc2.weight + | 0.005 | -0.054 | 0.137 | 0.031 | torch.Size([120]) || stage6.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.174 | 0.172 | 0.039 | torch.Size([120, 120]) || stage6.linear1.weight + | 0.002 | -0.275 | 0.348 | 0.113 | torch.Size([120]) || stage6.linear1.bias + | 0.704 | 0.402 | 1.002 | 0.132 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.weight + | 0.001 | -0.466 | 0.407 | 0.157 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.bias + | -0.000 | -0.172 | 0.570 | 0.025 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.337 | 0.378 | 0.041 | torch.Size([360, 120]) || stage6.residual_group2.blocks.0.attn.qkv_self.weight + | -0.000 | -0.071 | 0.068 | 0.019 | torch.Size([360]) || stage6.residual_group2.blocks.0.attn.qkv_self.bias + | 0.001 | -0.290 | 0.321 | 0.055 | torch.Size([120, 120]) || stage6.residual_group2.blocks.0.attn.proj.weight + | 0.001 | -0.255 | 0.250 | 0.104 | torch.Size([120]) || stage6.residual_group2.blocks.0.attn.proj.bias + | 0.695 | 0.353 | 0.966 | 0.098 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.weight + | -0.001 | -0.218 | 0.165 | 0.080 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.259 | 0.255 | 0.039 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc11.weight + | -0.044 | -0.256 | 0.042 | 0.047 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.234 | 0.214 | 0.035 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc12.weight + | 0.002 | -0.133 | 0.091 | 0.027 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.333 | 0.296 | 0.042 | torch.Size([120, 240]) || stage6.residual_group2.blocks.0.mlp.fc2.weight + | 0.003 | -0.238 | 0.280 | 0.092 | torch.Size([120]) || stage6.residual_group2.blocks.0.mlp.fc2.bias + | 0.671 | 0.425 | 0.980 | 0.094 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.weight + | 0.001 | -0.261 | 0.305 | 0.119 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.372 | 0.942 | 0.031 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.450 | 0.494 | 0.045 | torch.Size([360, 120]) || stage6.residual_group2.blocks.1.attn.qkv_self.weight + | 0.000 | -0.133 | 0.119 | 0.029 | 
torch.Size([360]) || stage6.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.239 | 0.288 | 0.046 | torch.Size([120, 120]) || stage6.residual_group2.blocks.1.attn.proj.weight + | -0.001 | -0.187 | 0.157 | 0.064 | torch.Size([120]) || stage6.residual_group2.blocks.1.attn.proj.bias + | 0.687 | 0.160 | 0.907 | 0.128 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.weight + | -0.002 | -0.192 | 0.222 | 0.084 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.257 | 0.426 | 0.042 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc11.weight + | -0.064 | -0.207 | 0.036 | 0.048 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.269 | 0.224 | 0.038 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc12.weight + | -0.000 | -0.126 | 0.129 | 0.030 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.308 | 0.298 | 0.041 | torch.Size([120, 240]) || stage6.residual_group2.blocks.1.mlp.fc2.weight + | -0.004 | -0.180 | 0.192 | 0.061 | torch.Size([120]) || stage6.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.297 | 0.368 | 0.069 | torch.Size([120, 120]) || stage6.linear2.weight + | 0.001 | -0.431 | 0.480 | 0.189 | torch.Size([120]) || stage6.linear2.bias + | 0.000 | -0.100 | 0.104 | 0.023 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.weight + | 0.001 | -0.018 | 0.029 | 0.010 | torch.Size([120]) || stage6.pa_deform.bias + | 0.000 | -0.105 | 0.111 | 0.015 | torch.Size([120, 242, 3, 3]) || stage6.pa_deform.conv_offset.0.weight + | -0.007 | -0.033 | 0.024 | 0.014 | torch.Size([120]) || stage6.pa_deform.conv_offset.0.bias + | -0.001 | -0.071 | 0.067 | 0.019 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.2.weight + | -0.003 | -0.061 | 0.043 | 0.022 | torch.Size([120]) || stage6.pa_deform.conv_offset.2.bias + | -0.000 | -0.074 | 0.068 | 0.019 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.4.weight + | 0.001 | -0.075 | 0.056 | 0.030 | torch.Size([120]) || stage6.pa_deform.conv_offset.4.bias + | 0.001 | -0.124 | 0.108 | 0.013 | torch.Size([324, 120, 3, 3]) || stage6.pa_deform.conv_offset.6.weight + | -0.001 | -0.113 | 0.076 | 0.021 | torch.Size([324]) || stage6.pa_deform.conv_offset.6.bias + | -0.001 | -0.517 | 0.524 | 0.101 | torch.Size([360, 360]) || stage6.pa_fuse.fc11.weight + | 0.154 | -0.305 | 0.679 | 0.180 | torch.Size([360]) || stage6.pa_fuse.fc11.bias + | 0.000 | -0.680 | 0.728 | 0.103 | torch.Size([360, 360]) || stage6.pa_fuse.fc12.weight + | 0.020 | -0.514 | 0.417 | 0.199 | torch.Size([360]) || stage6.pa_fuse.fc12.bias + | -0.000 | -0.587 | 0.737 | 0.135 | torch.Size([120, 360]) || stage6.pa_fuse.fc2.weight + | 0.015 | -0.437 | 0.490 | 0.230 | torch.Size([120]) || stage6.pa_fuse.fc2.bias + | 1.284 | 1.119 | 1.404 | 0.055 | torch.Size([30]) || stage7.reshape.1.weight + | -0.014 | -0.286 | 0.184 | 0.122 | torch.Size([30]) || stage7.reshape.1.bias + | -0.000 | -0.521 | 0.576 | 0.154 | torch.Size([120, 30]) || stage7.reshape.2.weight + | 0.004 | -0.387 | 0.738 | 0.175 | torch.Size([120]) || stage7.reshape.2.bias + | 0.440 | 0.099 | 0.775 | 0.141 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.weight + | -0.177 | -0.670 | 0.319 | 0.183 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.bias + | -0.055 | -2.159 | 1.979 | 0.240 | torch.Size([675, 6]) || stage7.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || 
stage7.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.535 | 0.554 | 0.104 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_self.weight + | 0.003 | -0.193 | 0.281 | 0.053 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_self.bias + | -0.001 | -0.397 | 0.395 | 0.075 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.attn.proj.weight + | -0.001 | -0.232 | 0.692 | 0.106 | torch.Size([120]) || stage7.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.899 | 1.073 | 0.091 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.122 | 0.104 | 0.017 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.310 | 0.157 | 0.440 | 0.055 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.weight + | 0.006 | -0.474 | 0.266 | 0.105 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.605 | 0.490 | 0.115 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc11.weight + | -0.101 | -0.310 | 0.126 | 0.070 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.448 | 0.475 | 0.116 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc12.weight + | 0.006 | -0.185 | 0.215 | 0.071 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc12.bias + | 0.001 | -0.465 | 0.512 | 0.122 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.mlp.fc2.weight + | 0.000 | -0.150 | 0.417 | 0.077 | torch.Size([120]) || stage7.residual_group1.blocks.0.mlp.fc2.bias + | 0.577 | 0.165 | 0.829 | 0.105 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.weight + | -0.136 | -0.849 | 0.206 | 0.141 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.bias + | -0.143 | -3.020 | 4.621 | 0.357 | torch.Size([675, 6]) || stage7.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.647 | 0.640 | 0.123 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_self.weight + | -0.002 | -0.356 | 0.382 | 0.064 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.457 | 0.378 | 0.081 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.attn.proj.weight + | 0.000 | -0.250 | 0.707 | 0.108 | torch.Size([120]) || stage7.residual_group1.blocks.1.attn.proj.bias + | -0.001 | -1.055 | 1.091 | 0.096 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.001 | -0.093 | 0.123 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.411 | 0.265 | 0.535 | 0.044 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.weight + | 0.008 | -0.630 | 0.264 | 0.121 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.501 | 0.506 | 0.119 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc11.weight + | -0.087 | -0.341 | 0.140 | 0.073 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.450 | 0.527 | 0.119 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc12.weight + | 0.005 | -0.188 | 0.171 | 0.063 | torch.Size([240]) || 
stage7.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.554 | 0.546 | 0.121 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.mlp.fc2.weight + | -0.000 | -0.135 | 0.220 | 0.061 | torch.Size([120]) || stage7.residual_group1.blocks.1.mlp.fc2.bias + | 0.655 | 0.134 | 0.896 | 0.130 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.weight + | -0.139 | -0.788 | 0.181 | 0.115 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.bias + | -0.062 | -3.469 | 3.276 | 0.272 | torch.Size([675, 6]) || stage7.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.2.attn.position_bias + | -0.000 | -0.592 | 0.650 | 0.124 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_self.weight + | -0.000 | -0.308 | 0.218 | 0.062 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.355 | 0.345 | 0.082 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.213 | 0.700 | 0.097 | torch.Size([120]) || stage7.residual_group1.blocks.2.attn.proj.bias + | -0.001 | -1.166 | 0.942 | 0.107 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.000 | -0.106 | 0.093 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.466 | 0.317 | 0.565 | 0.042 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.weight + | 0.014 | -0.657 | 0.280 | 0.118 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.541 | 0.494 | 0.118 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc11.weight + | -0.079 | -0.335 | 0.122 | 0.080 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.513 | 0.493 | 0.123 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc12.weight + | -0.007 | -0.180 | 0.175 | 0.066 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc12.bias + | -0.001 | -0.509 | 0.479 | 0.123 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.mlp.fc2.weight + | 0.004 | -0.093 | 0.293 | 0.054 | torch.Size([120]) || stage7.residual_group1.blocks.2.mlp.fc2.bias + | 0.693 | 0.147 | 0.945 | 0.133 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.weight + | -0.132 | -0.906 | 0.249 | 0.113 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.bias + | -0.108 | -3.576 | 4.241 | 0.344 | torch.Size([675, 6]) || stage7.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.945 | 1.095 | 0.129 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_self.weight + | 0.003 | -0.274 | 0.204 | 0.061 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_self.bias + | -0.001 | -0.379 | 0.351 | 0.081 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.attn.proj.weight + | 0.000 | -0.211 | 0.587 | 0.095 | torch.Size([120]) || stage7.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -1.269 | 1.067 | 0.102 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.001 | -0.091 | 0.117 | 0.021 | torch.Size([360]) || 
stage7.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.499 | 0.285 | 0.570 | 0.040 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.weight + | 0.012 | -0.567 | 0.273 | 0.104 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.bias + | 0.001 | -0.528 | 0.499 | 0.118 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc11.weight + | -0.084 | -0.349 | 0.141 | 0.078 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.547 | 0.592 | 0.126 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc12.weight + | 0.002 | -0.154 | 0.176 | 0.068 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc12.bias + | 0.001 | -0.520 | 0.480 | 0.125 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.mlp.fc2.weight + | 0.001 | -0.150 | 0.207 | 0.065 | torch.Size([120]) || stage7.residual_group1.blocks.3.mlp.fc2.bias + | 0.726 | 0.137 | 1.004 | 0.160 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.weight + | -0.122 | -0.907 | 0.180 | 0.103 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.bias + | -0.078 | -3.824 | 4.241 | 0.297 | torch.Size([675, 6]) || stage7.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.4.attn.position_bias + | -0.000 | -1.188 | 0.796 | 0.127 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_self.weight + | 0.002 | -0.248 | 0.207 | 0.056 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_self.bias + | -0.001 | -0.409 | 0.369 | 0.085 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.attn.proj.weight + | 0.002 | -0.224 | 0.322 | 0.094 | torch.Size([120]) || stage7.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -1.744 | 1.273 | 0.110 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.001 | -0.092 | 0.113 | 0.019 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.514 | 0.277 | 0.614 | 0.041 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.weight + | 0.016 | -0.621 | 0.286 | 0.095 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.bias + | 0.001 | -0.517 | 0.453 | 0.116 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc11.weight + | -0.064 | -0.260 | 0.143 | 0.083 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.503 | 0.554 | 0.129 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc12.weight + | -0.004 | -0.232 | 0.193 | 0.075 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc12.bias + | -0.001 | -0.595 | 0.543 | 0.128 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.mlp.fc2.weight + | 0.001 | -0.196 | 0.198 | 0.071 | torch.Size([120]) || stage7.residual_group1.blocks.4.mlp.fc2.bias + | 0.731 | 0.152 | 1.075 | 0.114 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.weight + | -0.076 | -1.003 | 0.176 | 0.107 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.bias + | -0.121 | -3.281 | 4.671 | 0.296 | torch.Size([675, 6]) || stage7.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || 
stage7.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.640 | 1.083 | 0.122 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_self.weight + | -0.001 | -0.239 | 0.314 | 0.068 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_self.bias + | 0.001 | -0.344 | 0.452 | 0.078 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.attn.proj.weight + | 0.004 | -0.361 | 0.251 | 0.093 | torch.Size([120]) || stage7.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.637 | 0.806 | 0.093 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.000 | -0.088 | 0.091 | 0.017 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.514 | 0.238 | 0.594 | 0.042 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.weight + | 0.017 | -0.650 | 0.162 | 0.089 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.bias + | 0.000 | -0.442 | 0.479 | 0.114 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc11.weight + | -0.040 | -0.400 | 0.203 | 0.101 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.541 | 0.514 | 0.130 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc12.weight + | -0.008 | -0.319 | 0.309 | 0.092 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -1.018 | 1.398 | 0.130 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.mlp.fc2.weight + | 0.001 | -1.606 | 0.269 | 0.179 | torch.Size([120]) || stage7.residual_group1.blocks.5.mlp.fc2.bias + | 0.000 | -0.186 | 0.207 | 0.048 | torch.Size([120, 120]) || stage7.linear1.weight + | 0.010 | -0.448 | 0.437 | 0.161 | torch.Size([120]) || stage7.linear1.bias + | 0.703 | 0.381 | 0.856 | 0.084 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.weight + | 0.014 | -0.645 | 0.486 | 0.169 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.bias + | -0.007 | -4.468 | 1.008 | 0.164 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage7.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.625 | 0.834 | 0.120 | torch.Size([360, 120]) || stage7.residual_group2.blocks.0.attn.qkv_self.weight + | -0.009 | -0.737 | 0.632 | 0.135 | torch.Size([360]) || stage7.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.403 | 0.406 | 0.088 | torch.Size([120, 120]) || stage7.residual_group2.blocks.0.attn.proj.weight + | -0.007 | -0.338 | 0.165 | 0.070 | torch.Size([120]) || stage7.residual_group2.blocks.0.attn.proj.bias + | 0.435 | 0.323 | 0.526 | 0.038 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.weight + | 0.005 | -0.678 | 0.379 | 0.117 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.465 | 0.467 | 0.110 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc11.weight + | -0.031 | -0.236 | 0.180 | 0.077 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.490 | 0.520 | 0.121 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc12.weight + | -0.003 | -0.197 | 0.242 | 0.069 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc12.bias + | -0.000 | -0.525 | 0.501 | 0.122 | torch.Size([120, 240]) || stage7.residual_group2.blocks.0.mlp.fc2.weight + | -0.005 | -0.431 | 0.164 | 0.077 | torch.Size([120]) || stage7.residual_group2.blocks.0.mlp.fc2.bias + | 0.703 | 0.306 | 0.866 | 0.079 | torch.Size([120]) || 
stage7.residual_group2.blocks.1.norm1.weight + | 0.009 | -0.647 | 0.481 | 0.149 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.bias + | -0.010 | -3.504 | 1.842 | 0.134 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage7.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.639 | 0.590 | 0.122 | torch.Size([360, 120]) || stage7.residual_group2.blocks.1.attn.qkv_self.weight + | -0.001 | -0.613 | 0.609 | 0.148 | torch.Size([360]) || stage7.residual_group2.blocks.1.attn.qkv_self.bias + | 0.001 | -0.316 | 0.325 | 0.085 | torch.Size([120, 120]) || stage7.residual_group2.blocks.1.attn.proj.weight + | -0.004 | -0.350 | 0.145 | 0.069 | torch.Size([120]) || stage7.residual_group2.blocks.1.attn.proj.bias + | 0.452 | 0.309 | 0.558 | 0.037 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.weight + | 0.003 | -0.661 | 0.246 | 0.091 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.580 | 0.410 | 0.108 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc11.weight + | -0.020 | -0.258 | 0.299 | 0.104 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.561 | 0.126 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc12.weight + | -0.002 | -0.234 | 0.434 | 0.090 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.778 | 0.581 | 0.124 | torch.Size([120, 240]) || stage7.residual_group2.blocks.1.mlp.fc2.weight + | -0.001 | -0.888 | 0.286 | 0.135 | torch.Size([120]) || stage7.residual_group2.blocks.1.mlp.fc2.bias + | -0.001 | -0.348 | 0.237 | 0.060 | torch.Size([120, 120]) || stage7.linear2.weight + | 0.023 | -0.390 | 0.506 | 0.167 | torch.Size([120]) || stage7.linear2.bias + | -0.000 | -0.104 | 0.107 | 0.024 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.weight + | 0.002 | -0.041 | 0.035 | 0.016 | torch.Size([120]) || stage7.pa_deform.bias + | -0.000 | -0.123 | 0.109 | 0.017 | torch.Size([120, 242, 3, 3]) || stage7.pa_deform.conv_offset.0.weight + | -0.002 | -0.034 | 0.032 | 0.015 | torch.Size([120]) || stage7.pa_deform.conv_offset.0.bias + | -0.001 | -0.111 | 0.084 | 0.019 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.2.weight + | -0.008 | -0.073 | 0.081 | 0.034 | torch.Size([120]) || stage7.pa_deform.conv_offset.2.bias + | -0.002 | -0.154 | 0.122 | 0.018 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.4.weight + | 0.014 | -0.041 | 0.068 | 0.026 | torch.Size([120]) || stage7.pa_deform.conv_offset.4.bias + | -0.001 | -0.408 | 0.365 | 0.034 | torch.Size([324, 120, 3, 3]) || stage7.pa_deform.conv_offset.6.weight + | -0.003 | -0.057 | 0.054 | 0.024 | torch.Size([324]) || stage7.pa_deform.conv_offset.6.bias + | 0.000 | -0.697 | 0.606 | 0.123 | torch.Size([360, 360]) || stage7.pa_fuse.fc11.weight + | 0.119 | -0.211 | 0.720 | 0.177 | torch.Size([360]) || stage7.pa_fuse.fc11.bias + | 0.000 | -1.175 | 0.924 | 0.154 | torch.Size([360, 360]) || stage7.pa_fuse.fc12.weight + | -0.000 | -0.581 | 0.580 | 0.190 | torch.Size([360]) || stage7.pa_fuse.fc12.bias + | 0.001 | -0.786 | 0.874 | 0.135 | torch.Size([120, 360]) || stage7.pa_fuse.fc2.weight + | -0.053 | -0.522 | 0.577 | 0.205 | torch.Size([120]) || stage7.pa_fuse.fc2.bias + | 1.225 | 1.000 | 1.516 | 0.095 | torch.Size([120]) || stage8.0.1.weight + | -0.013 | -0.413 | 0.465 | 0.139 | torch.Size([120]) || stage8.0.1.bias + | 0.000 | -2.505 | 0.627 | 0.136 
| torch.Size([180, 120]) || stage8.0.2.weight + | 0.005 | -0.397 | 0.377 | 0.107 | torch.Size([180]) || stage8.0.2.bias + | 0.456 | 0.123 | 0.760 | 0.129 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.weight + | -0.022 | -0.343 | 0.875 | 0.099 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.bias + | -0.014 | -1.907 | 2.592 | 0.130 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.632 | 0.628 | 0.099 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.0.attn.qkv_self.weight + | 0.006 | -0.567 | 0.668 | 0.148 | torch.Size([540]) || stage8.1.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.477 | 0.447 | 0.094 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.0.attn.proj.weight + | -0.010 | -0.460 | 0.225 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.0.attn.proj.bias + | 0.429 | 0.119 | 0.634 | 0.090 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.weight + | -0.007 | -0.338 | 0.803 | 0.086 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.bias + | -0.006 | -0.572 | 0.539 | 0.119 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc11.weight + | -0.060 | -0.260 | 0.185 | 0.060 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.461 | 0.548 | 0.113 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc12.weight + | 0.000 | -0.163 | 0.183 | 0.050 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.757 | 0.581 | 0.118 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.0.mlp.fc2.weight + | -0.003 | -0.191 | 0.121 | 0.057 | torch.Size([180]) || stage8.1.residual_group.blocks.0.mlp.fc2.bias + | 0.557 | 0.086 | 0.800 | 0.112 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.weight + | -0.029 | -0.230 | 0.878 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.bias + | -0.016 | -2.004 | 1.711 | 0.154 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.690 | 0.575 | 0.109 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.1.attn.qkv_self.weight + | 0.011 | -0.641 | 0.609 | 0.135 | torch.Size([540]) || stage8.1.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.466 | 0.401 | 0.094 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.1.attn.proj.weight + | -0.008 | -0.344 | 0.181 | 0.080 | torch.Size([180]) || stage8.1.residual_group.blocks.1.attn.proj.bias + | 0.503 | 0.226 | 0.742 | 0.093 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.weight + | -0.009 | -0.404 | 0.818 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.bias + | -0.007 | -0.595 | 0.532 | 0.121 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc11.weight + | -0.068 | -0.261 | 0.071 | 0.053 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.573 | 0.116 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc12.weight + | 0.002 | -0.129 | 0.197 | 0.046 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.556 | 0.582 | 0.118 | torch.Size([180, 360]) || 
stage8.1.residual_group.blocks.1.mlp.fc2.weight + | -0.003 | -0.170 | 0.145 | 0.052 | torch.Size([180]) || stage8.1.residual_group.blocks.1.mlp.fc2.bias + | 0.699 | 0.202 | 0.912 | 0.109 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.weight + | -0.033 | -0.253 | 0.924 | 0.091 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.bias + | -0.030 | -2.510 | 2.088 | 0.194 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.637 | 0.801 | 0.116 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.2.attn.qkv_self.weight + | 0.006 | -0.512 | 0.520 | 0.110 | torch.Size([540]) || stage8.1.residual_group.blocks.2.attn.qkv_self.bias + | 0.000 | -0.381 | 0.337 | 0.090 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.2.attn.proj.weight + | -0.011 | -0.238 | 0.234 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.2.attn.proj.bias + | 0.594 | 0.150 | 0.810 | 0.108 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.weight + | -0.010 | -0.483 | 0.726 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.bias + | -0.006 | -0.567 | 0.499 | 0.125 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc11.weight + | -0.077 | -0.360 | 0.050 | 0.056 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.536 | 0.673 | 0.119 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc12.weight + | 0.001 | -0.142 | 0.186 | 0.043 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.536 | 0.524 | 0.119 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.2.mlp.fc2.weight + | -0.006 | -0.147 | 0.133 | 0.051 | torch.Size([180]) || stage8.1.residual_group.blocks.2.mlp.fc2.bias + | 0.683 | 0.141 | 0.908 | 0.105 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.weight + | -0.033 | -0.199 | 0.878 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.bias + | -0.039 | -1.527 | 3.891 | 0.199 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.682 | 0.693 | 0.120 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.3.attn.qkv_self.weight + | 0.007 | -0.543 | 0.513 | 0.138 | torch.Size([540]) || stage8.1.residual_group.blocks.3.attn.qkv_self.bias + | -0.001 | -0.390 | 0.476 | 0.089 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.3.attn.proj.weight + | -0.007 | -0.176 | 0.150 | 0.062 | torch.Size([180]) || stage8.1.residual_group.blocks.3.attn.proj.bias + | 0.640 | 0.094 | 0.853 | 0.120 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.weight + | -0.009 | -0.372 | 0.683 | 0.084 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.bias + | -0.006 | -0.628 | 0.521 | 0.126 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc11.weight + | -0.089 | -0.367 | 0.047 | 0.054 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.629 | 0.562 | 0.121 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc12.weight + | -0.001 | -0.186 | 0.128 | 0.042 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.485 | 0.499 | 0.118 | torch.Size([180, 360]) || 
stage8.1.residual_group.blocks.3.mlp.fc2.weight + | -0.007 | -0.138 | 0.209 | 0.050 | torch.Size([180]) || stage8.1.residual_group.blocks.3.mlp.fc2.bias + | 0.000 | -0.294 | 0.577 | 0.071 | torch.Size([180, 180]) || stage8.1.linear.weight + | 0.004 | -0.349 | 0.235 | 0.072 | torch.Size([180]) || stage8.1.linear.bias + | 0.708 | 0.242 | 1.026 | 0.136 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.weight + | -0.032 | -0.212 | 0.830 | 0.100 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.bias + | -0.039 | -1.954 | 2.394 | 0.212 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.922 | 0.646 | 0.116 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.429 | 0.524 | 0.101 | torch.Size([540]) || stage8.2.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.467 | 0.453 | 0.109 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.0.attn.proj.weight + | -0.005 | -0.339 | 0.264 | 0.095 | torch.Size([180]) || stage8.2.residual_group.blocks.0.attn.proj.bias + | 0.587 | 0.255 | 0.837 | 0.086 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.weight + | -0.011 | -0.285 | 0.721 | 0.083 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.bias + | -0.006 | -0.586 | 0.534 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc11.weight + | -0.075 | -0.225 | 0.066 | 0.047 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.493 | 0.532 | 0.123 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc12.weight + | 0.003 | -0.189 | 0.178 | 0.047 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.551 | 0.543 | 0.124 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.0.mlp.fc2.weight + | -0.010 | -0.154 | 0.142 | 0.054 | torch.Size([180]) || stage8.2.residual_group.blocks.0.mlp.fc2.bias + | 0.773 | 0.210 | 1.004 | 0.113 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.weight + | -0.035 | -0.176 | 0.873 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.bias + | -0.027 | -2.407 | 1.736 | 0.214 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.817 | 0.977 | 0.123 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.1.attn.qkv_self.weight + | 0.001 | -0.659 | 0.461 | 0.115 | torch.Size([540]) || stage8.2.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.484 | 0.453 | 0.109 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.1.attn.proj.weight + | -0.014 | -0.315 | 0.252 | 0.091 | torch.Size([180]) || stage8.2.residual_group.blocks.1.attn.proj.bias + | 0.641 | 0.337 | 0.810 | 0.081 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.weight + | -0.011 | -0.177 | 0.806 | 0.083 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.bias + | -0.006 | -0.569 | 0.598 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc11.weight + | -0.079 | -0.323 | 0.071 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.512 | 0.577 | 0.126 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc12.weight + | 
-0.003 | -0.142 | 0.161 | 0.050 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.529 | 0.572 | 0.125 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.1.mlp.fc2.weight + | -0.010 | -0.178 | 0.159 | 0.066 | torch.Size([180]) || stage8.2.residual_group.blocks.1.mlp.fc2.bias + | 0.857 | 0.199 | 1.153 | 0.112 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.weight + | -0.039 | -0.189 | 0.943 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.bias + | -0.042 | -1.962 | 2.773 | 0.246 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.783 | 0.655 | 0.123 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.2.attn.qkv_self.weight + | 0.004 | -0.338 | 0.533 | 0.099 | torch.Size([540]) || stage8.2.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.497 | 0.461 | 0.107 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.2.attn.proj.weight + | -0.008 | -0.288 | 0.183 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.2.attn.proj.bias + | 0.681 | 0.327 | 0.878 | 0.085 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.weight + | -0.012 | -0.178 | 0.773 | 0.084 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.bias + | -0.006 | -0.789 | 0.546 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc11.weight + | -0.081 | -0.249 | 0.036 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.526 | 0.555 | 0.128 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc12.weight + | 0.000 | -0.133 | 0.191 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.572 | 0.529 | 0.126 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.164 | 0.147 | 0.065 | torch.Size([180]) || stage8.2.residual_group.blocks.2.mlp.fc2.bias + | 0.877 | 0.198 | 1.043 | 0.094 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.weight + | -0.038 | -0.210 | 0.916 | 0.091 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.bias + | -0.094 | -2.974 | 4.987 | 0.299 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.964 | 1.011 | 0.126 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.3.attn.qkv_self.weight + | -0.002 | -0.404 | 0.429 | 0.101 | torch.Size([540]) || stage8.2.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.501 | 0.489 | 0.110 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.3.attn.proj.weight + | -0.021 | -0.305 | 0.208 | 0.097 | torch.Size([180]) || stage8.2.residual_group.blocks.3.attn.proj.bias + | 0.697 | 0.295 | 0.894 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.weight + | -0.015 | -0.241 | 0.712 | 0.086 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.bias + | -0.005 | -0.562 | 0.573 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc11.weight + | -0.085 | -0.302 | 0.080 | 0.060 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.734 | 0.573 | 0.130 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc12.weight + | 
0.001 | -0.150 | 0.161 | 0.054 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.671 | 0.623 | 0.127 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.3.mlp.fc2.weight + | -0.023 | -0.252 | 0.317 | 0.081 | torch.Size([180]) || stage8.2.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.278 | 0.345 | 0.064 | torch.Size([180, 180]) || stage8.2.linear.weight + | 0.004 | -0.315 | 0.148 | 0.064 | torch.Size([180]) || stage8.2.linear.bias + | 0.850 | 0.326 | 1.087 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.weight + | -0.031 | -0.334 | 0.779 | 0.106 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.bias + | -0.012 | -2.917 | 1.476 | 0.175 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.603 | 0.666 | 0.124 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.374 | 0.381 | 0.086 | torch.Size([540]) || stage8.3.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.577 | 0.605 | 0.119 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.0.attn.proj.weight + | -0.008 | -0.394 | 0.499 | 0.134 | torch.Size([180]) || stage8.3.residual_group.blocks.0.attn.proj.bias + | 0.636 | 0.321 | 0.790 | 0.073 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.weight + | -0.013 | -0.294 | 0.774 | 0.090 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.bias + | -0.004 | -0.540 | 0.539 | 0.123 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc11.weight + | -0.065 | -0.212 | 0.047 | 0.051 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.608 | 0.603 | 0.130 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc12.weight + | -0.002 | -0.177 | 0.155 | 0.051 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.573 | 0.630 | 0.129 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.0.mlp.fc2.weight + | -0.005 | -0.189 | 0.178 | 0.071 | torch.Size([180]) || stage8.3.residual_group.blocks.0.mlp.fc2.bias + | 0.899 | 0.275 | 1.048 | 0.099 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.weight + | -0.031 | -0.223 | 0.771 | 0.088 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.bias + | -0.003 | -3.151 | 1.718 | 0.202 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.1.attn.relative_position_index + | -0.000 | -0.732 | 0.868 | 0.127 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.1.attn.qkv_self.weight + | 0.002 | -0.412 | 0.350 | 0.093 | torch.Size([540]) || stage8.3.residual_group.blocks.1.attn.qkv_self.bias + | 0.001 | -0.466 | 0.487 | 0.114 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.1.attn.proj.weight + | -0.006 | -0.388 | 0.400 | 0.129 | torch.Size([180]) || stage8.3.residual_group.blocks.1.attn.proj.bias + | 0.711 | 0.381 | 0.864 | 0.082 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.weight + | -0.009 | -0.240 | 0.692 | 0.090 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.bias + | -0.005 | -0.657 | 0.639 | 0.126 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc11.weight + | -0.077 | -0.263 | 0.047 | 0.057 | torch.Size([360]) || 
stage8.3.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.673 | 0.605 | 0.134 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc12.weight + | 0.002 | -0.158 | 0.155 | 0.046 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc12.bias + | -0.000 | -0.582 | 0.585 | 0.131 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.1.mlp.fc2.weight + | -0.009 | -0.253 | 0.178 | 0.070 | torch.Size([180]) || stage8.3.residual_group.blocks.1.mlp.fc2.bias + | 0.941 | 0.262 | 1.154 | 0.094 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.weight + | -0.032 | -0.162 | 0.906 | 0.084 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.bias + | -0.005 | -3.421 | 1.350 | 0.205 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.777 | 0.735 | 0.130 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.2.attn.qkv_self.weight + | 0.000 | -0.355 | 0.421 | 0.092 | torch.Size([540]) || stage8.3.residual_group.blocks.2.attn.qkv_self.bias + | 0.000 | -0.479 | 0.475 | 0.115 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.2.attn.proj.weight + | -0.013 | -0.292 | 0.345 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.2.attn.proj.bias + | 0.743 | 0.242 | 0.919 | 0.093 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.weight + | -0.011 | -0.214 | 0.691 | 0.094 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.bias + | -0.005 | -0.633 | 0.498 | 0.127 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc11.weight + | -0.082 | -0.346 | 0.087 | 0.062 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.591 | 0.670 | 0.134 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc12.weight + | 0.001 | -0.190 | 0.151 | 0.056 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.560 | 0.637 | 0.132 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.2.mlp.fc2.weight + | -0.009 | -0.226 | 0.250 | 0.085 | torch.Size([180]) || stage8.3.residual_group.blocks.2.mlp.fc2.bias + | 0.950 | 0.250 | 1.103 | 0.086 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.weight + | -0.035 | -0.196 | 0.925 | 0.088 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.bias + | -0.026 | -3.591 | 5.653 | 0.236 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.753 | 0.637 | 0.128 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.3.attn.qkv_self.weight + | 0.000 | -0.333 | 0.432 | 0.081 | torch.Size([540]) || stage8.3.residual_group.blocks.3.attn.qkv_self.bias + | 0.001 | -0.591 | 0.591 | 0.118 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.3.attn.proj.weight + | -0.014 | -0.348 | 0.267 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.3.attn.proj.bias + | 0.735 | 0.254 | 0.893 | 0.082 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.weight + | -0.011 | -0.241 | 0.659 | 0.093 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.bias + | -0.005 | -0.628 | 0.667 | 0.125 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc11.weight + | -0.076 | -0.411 | 0.113 | 0.072 | torch.Size([360]) || 
stage8.3.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.662 | 0.578 | 0.135 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc12.weight + | -0.004 | -0.208 | 0.169 | 0.054 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.602 | 0.588 | 0.131 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.3.mlp.fc2.weight + | -0.011 | -0.218 | 0.232 | 0.096 | torch.Size([180]) || stage8.3.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.343 | 0.316 | 0.065 | torch.Size([180, 180]) || stage8.3.linear.weight + | 0.010 | -0.297 | 0.187 | 0.061 | torch.Size([180]) || stage8.3.linear.bias + | 1.012 | 0.330 | 1.282 | 0.149 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.weight + | -0.030 | -0.347 | 0.800 | 0.134 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.bias + | -0.013 | -2.816 | 3.792 | 0.236 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.807 | 0.825 | 0.131 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.0.attn.qkv_self.weight + | -0.003 | -0.429 | 0.319 | 0.083 | torch.Size([540]) || stage8.4.residual_group.blocks.0.attn.qkv_self.bias + | 0.001 | -0.553 | 0.569 | 0.136 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.0.attn.proj.weight + | -0.019 | -0.443 | 0.441 | 0.139 | torch.Size([180]) || stage8.4.residual_group.blocks.0.attn.proj.bias + | 0.638 | 0.420 | 0.797 | 0.063 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.weight + | -0.018 | -0.222 | 0.886 | 0.107 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.bias + | -0.002 | -0.576 | 0.510 | 0.117 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc11.weight + | -0.018 | -0.277 | 0.123 | 0.068 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.687 | 0.625 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc12.weight + | -0.007 | -0.264 | 0.267 | 0.076 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc12.bias + | 0.001 | -0.639 | 0.705 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.0.mlp.fc2.weight + | -0.012 | -0.255 | 0.274 | 0.095 | torch.Size([180]) || stage8.4.residual_group.blocks.0.mlp.fc2.bias + | 1.092 | 0.475 | 1.341 | 0.115 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.weight + | -0.030 | -0.294 | 0.686 | 0.113 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.bias + | 0.018 | -3.165 | 0.990 | 0.213 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.695 | 0.699 | 0.133 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.1.attn.qkv_self.weight + | 0.002 | -0.319 | 0.286 | 0.075 | torch.Size([540]) || stage8.4.residual_group.blocks.1.attn.qkv_self.bias + | -0.001 | -0.542 | 0.519 | 0.133 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.1.attn.proj.weight + | -0.017 | -0.439 | 0.451 | 0.152 | torch.Size([180]) || stage8.4.residual_group.blocks.1.attn.proj.bias + | 0.664 | 0.366 | 0.835 | 0.074 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.weight + | -0.015 | -0.217 | 0.985 | 0.103 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.bias + | 
-0.002 | -0.641 | 0.563 | 0.117 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc11.weight + | -0.022 | -0.381 | 0.161 | 0.078 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.571 | 0.642 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc12.weight + | 0.003 | -0.279 | 0.311 | 0.087 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.738 | 0.633 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.1.mlp.fc2.weight + | -0.007 | -0.254 | 0.261 | 0.084 | torch.Size([180]) || stage8.4.residual_group.blocks.1.mlp.fc2.bias + | 1.125 | 0.525 | 1.405 | 0.117 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.weight + | -0.033 | -0.186 | 0.627 | 0.082 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.bias + | 0.028 | -3.477 | 0.957 | 0.217 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.663 | 0.658 | 0.130 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.2.attn.qkv_self.weight + | -0.007 | -0.357 | 0.255 | 0.064 | torch.Size([540]) || stage8.4.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.596 | 0.578 | 0.137 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.2.attn.proj.weight + | -0.018 | -0.506 | 0.389 | 0.159 | torch.Size([180]) || stage8.4.residual_group.blocks.2.attn.proj.bias + | 0.694 | 0.319 | 0.865 | 0.084 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.weight + | -0.018 | -0.150 | 0.975 | 0.087 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.bias + | -0.002 | -0.619 | 0.565 | 0.116 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc11.weight + | -0.025 | -0.345 | 0.208 | 0.086 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.624 | 0.607 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc12.weight + | -0.003 | -0.388 | 0.290 | 0.075 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.927 | 0.675 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.325 | 0.240 | 0.096 | torch.Size([180]) || stage8.4.residual_group.blocks.2.mlp.fc2.bias + | 1.108 | 0.535 | 1.297 | 0.094 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.weight + | -0.035 | -0.213 | 0.546 | 0.064 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.bias + | 0.020 | -3.042 | 1.420 | 0.192 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.697 | 0.700 | 0.128 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.3.attn.qkv_self.weight + | -0.000 | -0.220 | 0.311 | 0.065 | torch.Size([540]) || stage8.4.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.652 | 0.592 | 0.138 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.3.attn.proj.weight + | -0.019 | -0.535 | 0.426 | 0.154 | torch.Size([180]) || stage8.4.residual_group.blocks.3.attn.proj.bias + | 0.685 | 0.225 | 0.893 | 0.082 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.weight + | -0.023 | -0.211 | 0.938 | 0.093 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.bias + | 
-0.001 | -0.501 | 0.564 | 0.113 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc11.weight + | -0.014 | -0.339 | 0.237 | 0.092 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.560 | 0.626 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc12.weight + | 0.000 | -0.231 | 0.239 | 0.075 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.544 | 0.657 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.3.mlp.fc2.weight + | -0.007 | -0.271 | 0.274 | 0.093 | torch.Size([180]) || stage8.4.residual_group.blocks.3.mlp.fc2.bias + | -0.001 | -0.473 | 0.481 | 0.069 | torch.Size([180, 180]) || stage8.4.linear.weight + | 0.029 | -0.333 | 0.194 | 0.076 | torch.Size([180]) || stage8.4.linear.bias + | 1.025 | 0.297 | 1.336 | 0.162 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.weight + | -0.034 | -0.429 | 0.872 | 0.141 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.bias + | -0.574 | -4.515 | 3.381 | 0.800 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.771 | 0.886 | 0.125 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.0.attn.qkv_self.weight + | 0.000 | -0.356 | 0.521 | 0.085 | torch.Size([540]) || stage8.5.residual_group.blocks.0.attn.qkv_self.bias + | -0.001 | -0.632 | 0.656 | 0.147 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.0.attn.proj.weight + | -0.029 | -0.329 | 0.697 | 0.127 | torch.Size([180]) || stage8.5.residual_group.blocks.0.attn.proj.bias + | 0.777 | 0.446 | 0.952 | 0.069 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.weight + | -0.022 | -0.335 | 0.920 | 0.121 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.bias + | -0.002 | -0.520 | 0.598 | 0.117 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc11.weight + | -0.013 | -0.456 | 0.200 | 0.075 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.677 | 0.642 | 0.137 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc12.weight + | 0.005 | -0.272 | 0.233 | 0.083 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.762 | 0.598 | 0.136 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.0.mlp.fc2.weight + | -0.025 | -0.244 | 0.583 | 0.111 | torch.Size([180]) || stage8.5.residual_group.blocks.0.mlp.fc2.bias + | 1.021 | 0.261 | 1.261 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.weight + | -0.033 | -0.358 | 0.867 | 0.120 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.bias + | -0.550 | -3.274 | 4.406 | 0.670 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.819 | 0.986 | 0.122 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.1.attn.qkv_self.weight + | 0.005 | -0.510 | 0.446 | 0.084 | torch.Size([540]) || stage8.5.residual_group.blocks.1.attn.qkv_self.bias + | -0.003 | -0.739 | 0.682 | 0.151 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.1.attn.proj.weight + | -0.032 | -0.318 | 0.607 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.attn.proj.bias + | 0.823 | 0.420 | 0.950 | 0.070 | torch.Size([180]) || 
stage8.5.residual_group.blocks.1.norm2.weight + | -0.021 | -0.274 | 0.882 | 0.111 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.bias + | -0.002 | -0.496 | 0.532 | 0.117 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc11.weight + | -0.028 | -0.260 | 0.194 | 0.080 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.620 | 0.586 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc12.weight + | 0.004 | -0.284 | 0.423 | 0.083 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.774 | 0.614 | 0.137 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.1.mlp.fc2.weight + | -0.028 | -0.371 | 0.561 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.mlp.fc2.bias + | 1.096 | 0.377 | 1.321 | 0.110 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.weight + | -0.033 | -0.244 | 0.755 | 0.100 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.bias + | -0.441 | -3.439 | 5.870 | 0.668 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.710 | 0.679 | 0.123 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.2.attn.qkv_self.weight + | 0.003 | -0.277 | 0.283 | 0.068 | torch.Size([540]) || stage8.5.residual_group.blocks.2.attn.qkv_self.bias + | 0.001 | -0.824 | 0.684 | 0.150 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.2.attn.proj.weight + | -0.033 | -0.390 | 0.545 | 0.155 | torch.Size([180]) || stage8.5.residual_group.blocks.2.attn.proj.bias + | 0.843 | 0.390 | 0.984 | 0.076 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.weight + | -0.022 | -0.211 | 0.854 | 0.090 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.bias + | -0.002 | -0.522 | 0.503 | 0.116 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc11.weight + | -0.024 | -0.243 | 0.219 | 0.091 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc11.bias + | -0.001 | -0.638 | 0.617 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc12.weight + | -0.004 | -0.268 | 0.380 | 0.078 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.713 | 0.769 | 0.138 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.2.mlp.fc2.weight + | -0.034 | -0.372 | 0.592 | 0.151 | torch.Size([180]) || stage8.5.residual_group.blocks.2.mlp.fc2.bias + | 1.027 | 0.318 | 1.206 | 0.094 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.weight + | -0.033 | -0.187 | 0.768 | 0.088 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.bias + | -0.347 | -2.664 | 2.684 | 0.528 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.677 | 0.676 | 0.127 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.3.attn.qkv_self.weight + | 0.002 | -0.410 | 0.354 | 0.080 | torch.Size([540]) || stage8.5.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.630 | 0.725 | 0.145 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.3.attn.proj.weight + | -0.041 | -0.385 | 0.660 | 0.163 | torch.Size([180]) || stage8.5.residual_group.blocks.3.attn.proj.bias + | 0.849 | 0.390 | 0.985 | 0.070 | torch.Size([180]) || 
stage8.5.residual_group.blocks.3.norm2.weight + | -0.023 | -0.163 | 0.810 | 0.084 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.bias + | -0.002 | -0.547 | 0.536 | 0.115 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc11.weight + | -0.012 | -0.366 | 0.252 | 0.106 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.669 | 0.597 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc12.weight + | -0.002 | -0.216 | 0.202 | 0.074 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.700 | 0.674 | 0.139 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.3.mlp.fc2.weight + | -0.032 | -0.376 | 0.666 | 0.134 | torch.Size([180]) || stage8.5.residual_group.blocks.3.mlp.fc2.bias + | -0.001 | -0.299 | 0.469 | 0.069 | torch.Size([180, 180]) || stage8.5.linear.weight + | 0.081 | -0.562 | 0.263 | 0.109 | torch.Size([180]) || stage8.5.linear.bias + | 1.111 | 0.208 | 1.434 | 0.192 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.weight + | -0.048 | -0.547 | 0.851 | 0.175 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.bias + | -0.252 | -2.157 | 6.293 | 0.490 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.664 | 0.631 | 0.123 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.0.attn.qkv_self.weight + | 0.007 | -0.293 | 0.366 | 0.078 | torch.Size([540]) || stage8.6.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.701 | 0.726 | 0.154 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.0.attn.proj.weight + | 0.030 | -0.318 | 0.331 | 0.109 | torch.Size([180]) || stage8.6.residual_group.blocks.0.attn.proj.bias + | 0.959 | 0.475 | 1.322 | 0.088 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.weight + | -0.039 | -0.421 | 0.873 | 0.151 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.bias + | -0.002 | -0.550 | 0.783 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc11.weight + | 0.002 | -0.269 | 0.152 | 0.069 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.914 | 0.839 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc12.weight + | 0.001 | -0.340 | 0.304 | 0.075 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.592 | 0.713 | 0.140 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.0.mlp.fc2.weight + | 0.002 | -0.535 | 0.384 | 0.177 | torch.Size([180]) || stage8.6.residual_group.blocks.0.mlp.fc2.bias + | 1.123 | 0.183 | 1.352 | 0.165 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.weight + | -0.047 | -0.513 | 0.903 | 0.168 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.bias + | -0.234 | -1.968 | 6.366 | 0.448 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.1.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.751 | 0.759 | 0.121 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.1.attn.qkv_self.weight + | -0.001 | -0.300 | 0.214 | 0.061 | torch.Size([540]) || stage8.6.residual_group.blocks.1.attn.qkv_self.bias + | -0.000 | -0.657 | 0.699 | 0.148 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.1.attn.proj.weight + | 0.031 | 
-0.321 | 0.293 | 0.115 | torch.Size([180]) || stage8.6.residual_group.blocks.1.attn.proj.bias + | 0.986 | 0.416 | 1.360 | 0.096 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.weight + | -0.038 | -0.393 | 0.807 | 0.146 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.bias + | -0.001 | -0.589 | 0.620 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc11.weight + | 0.005 | -0.316 | 0.229 | 0.071 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.738 | 0.766 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc12.weight + | 0.001 | -0.252 | 0.302 | 0.072 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.674 | 0.629 | 0.140 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.1.mlp.fc2.weight + | -0.001 | -0.475 | 0.441 | 0.175 | torch.Size([180]) || stage8.6.residual_group.blocks.1.mlp.fc2.bias + | 1.097 | 0.342 | 1.294 | 0.134 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.weight + | -0.054 | -0.639 | 0.904 | 0.186 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.bias + | -0.135 | -3.252 | 1.238 | 0.360 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.2.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.672 | 0.663 | 0.128 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.2.attn.qkv_self.weight + | 0.007 | -0.170 | 0.228 | 0.046 | torch.Size([540]) || stage8.6.residual_group.blocks.2.attn.qkv_self.bias + | -0.001 | -0.660 | 0.651 | 0.147 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.2.attn.proj.weight + | 0.031 | -0.360 | 0.322 | 0.126 | torch.Size([180]) || stage8.6.residual_group.blocks.2.attn.proj.bias + | 1.004 | 0.360 | 1.381 | 0.099 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.weight + | -0.042 | -0.447 | 0.808 | 0.157 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.bias + | -0.000 | -0.600 | 0.603 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc11.weight + | 0.022 | -0.447 | 0.249 | 0.086 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.666 | 0.708 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc12.weight + | -0.002 | -0.326 | 0.272 | 0.075 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc12.bias + | -0.001 | -0.653 | 0.719 | 0.142 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.488 | 0.321 | 0.153 | torch.Size([180]) || stage8.6.residual_group.blocks.2.mlp.fc2.bias + | 1.095 | 0.272 | 1.302 | 0.123 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.weight + | -0.052 | -0.557 | 1.069 | 0.192 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.bias + | -0.196 | -2.349 | 1.401 | 0.360 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.3.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.741 | 0.657 | 0.124 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.3.attn.qkv_self.weight + | 0.001 | -0.186 | 0.141 | 0.040 | torch.Size([540]) || stage8.6.residual_group.blocks.3.attn.qkv_self.bias + | -0.001 | -0.669 | 0.671 | 0.139 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.3.attn.proj.weight + | -0.004 | -0.323 | 0.300 | 
0.124 | torch.Size([180]) || stage8.6.residual_group.blocks.3.attn.proj.bias + | 0.999 | 0.383 | 1.380 | 0.103 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.weight + | -0.044 | -0.392 | 0.694 | 0.163 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.bias + | 0.000 | -0.577 | 0.857 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc11.weight + | 0.041 | -0.394 | 0.238 | 0.087 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.924 | 0.828 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc12.weight + | -0.003 | -0.214 | 0.407 | 0.071 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.827 | 0.755 | 0.141 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.3.mlp.fc2.weight + | 0.022 | -0.296 | 0.262 | 0.107 | torch.Size([180]) || stage8.6.residual_group.blocks.3.mlp.fc2.bias + | 0.002 | -1.059 | 1.262 | 0.089 | torch.Size([180, 180]) || stage8.6.linear.weight + | 0.031 | -0.789 | 0.427 | 0.120 | torch.Size([180]) || stage8.6.linear.bias + | 0.389 | 0.079 | 1.137 | 0.176 | torch.Size([180]) || norm.weight + | -0.021 | -0.669 | 0.888 | 0.127 | torch.Size([180]) || norm.bias + | 0.000 | -0.486 | 0.568 | 0.103 | torch.Size([120, 180]) || conv_after_body.weight + | -0.000 | -0.167 | 0.168 | 0.055 | torch.Size([120]) || conv_after_body.bias + | -0.000 | -1.782 | 1.300 | 0.109 | torch.Size([64, 120, 1, 3, 3]) || conv_before_upsample.0.weight + | -0.019 | -0.542 | 0.437 | 0.162 | torch.Size([64]) || conv_before_upsample.0.bias + | 0.001 | -1.915 | 1.372 | 0.090 | torch.Size([256, 64, 1, 3, 3]) || upsample.0.weight + | -0.045 | -0.281 | 0.215 | 0.097 | torch.Size([256]) || upsample.0.bias + | -0.006 | -4.826 | 0.582 | 0.075 | torch.Size([256, 64, 1, 3, 3]) || upsample.5.weight + | -0.154 | -0.441 | 0.187 | 0.100 | torch.Size([256]) || upsample.5.bias + | 0.000 | -0.210 | 0.246 | 0.012 | torch.Size([64, 64, 1, 3, 3]) || upsample.10.weight + | 0.000 | -0.013 | 0.007 | 0.003 | torch.Size([64]) || upsample.10.bias + | 0.000 | -0.044 | 0.042 | 0.004 | torch.Size([3, 64, 1, 3, 3]) || conv_last.weight + | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([3]) || conv_last.bias + +22-03-11 10:53:04.972 : task: 001_train_vrt_videosr_bi_reds_6frames + model: vrt + gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7] + dist: False + find_unused_parameters: False + use_static_graph: True + scale: 4 + n_channels: 3 + path:[ + root: experiments + pretrained_netG: /home/cll/dev/KAIR/model_zoo/vrt/001_VRT_videosr_bi_REDS_6frames.pth + pretrained_netE: None + task: experiments/001_train_vrt_videosr_bi_reds_6frames + log: experiments/001_train_vrt_videosr_bi_reds_6frames + options: experiments/001_train_vrt_videosr_bi_reds_6frames/options + models: experiments/001_train_vrt_videosr_bi_reds_6frames/models + images: experiments/001_train_vrt_videosr_bi_reds_6frames/images + pretrained_optimizerG: None + ] + datasets:[ + train:[ + name: train_dataset + dataset_type: VideoRecurrentTrainDataset + dataroot_gt: /home/cll/datasets/REDS/train/train_sharp + dataroot_lq: /home/cll/datasets/REDS/train/train_sharp_bicubic/X4 + meta_info_file: data/meta_info/meta_info_REDS_GT.txt + filename_tmpl: 08d + filename_ext: png + val_partition: REDS4 + test_mode: False + io_backend:[ + type: disk + ] + num_frame: 4 + gt_size: 256 + interval_list: [1] + random_reverse: False + use_hflip: True + use_rot: True + dataloader_shuffle: True + dataloader_num_workers: 32 + dataloader_batch_size: 8 + phase: train + scale: 4 + 
n_channels: 3 + ] + test:[ + name: test_dataset + dataset_type: VideoRecurrentTestDataset + dataroot_gt: /home/cll/Desktop/REDS4/GT + dataroot_lq: /home/cll/Desktop/REDS4/sharp_bicubic + cache_data: True + io_backend:[ + type: disk + ] + num_frame: -1 + phase: test + scale: 4 + n_channels: 3 + ] + ] + netG:[ + net_type: vrt + upscale: 4 + img_size: [6, 64, 64] + window_size: [6, 8, 8] + depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4] + indep_reconsts: [11, 12] + embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180] + num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6] + spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth + pa_frames: 2 + deformable_groups: 12 + nonblind_denoising: False + use_checkpoint_attn: False + use_checkpoint_ffn: False + no_checkpoint_attn_blocks: [] + no_checkpoint_ffn_blocks: [] + init_type: default + scale: 4 + ] + train:[ + G_lossfn_type: charbonnier + G_lossfn_weight: 1.0 + G_charbonnier_eps: 1e-09 + E_decay: 0 + G_optimizer_type: adam + G_optimizer_lr: 0.0004 + G_optimizer_betas: [0.9, 0.99] + G_optimizer_wd: 0 + G_optimizer_clipgrad: None + G_optimizer_reuse: True + fix_iter: 20000 + fix_lr_mul: 0.125 + fix_keys: ['spynet', 'deform'] + total_iter: 300000 + G_scheduler_type: CosineAnnealingWarmRestarts + G_scheduler_periods: 300000 + G_scheduler_eta_min: 1e-07 + G_regularizer_orthstep: None + G_regularizer_clipstep: None + G_param_strict: True + E_param_strict: True + checkpoint_test: 5000 + checkpoint_save: 5000 + checkpoint_print: 200 + F_feature_layer: 34 + F_weights: 1.0 + F_lossfn_type: l1 + F_use_input_norm: True + F_use_range_norm: False + G_scheduler_restart_weights: 1 + ] + val:[ + save_img: False + pad_seq: False + flip_seq: False + center_frame_only: False + num_frame_testing: 40 + num_frame_overlapping: 2 + size_patch_testing: 128 + ] + opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json + is_train: True + merge_bn: False + merge_bn_startpoint: -1 + num_gpu: 8 + rank: 0 + world_size: 1 + +22-03-11 10:53:05.016 : Number of train images: 24,000, iters: 3,000 +22-03-11 10:53:19.424 : +Networks name: VRT +Params number: 30676435 +Net structure: +VRT( + (conv_first): Conv3d(27, 120, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (spynet): SpyNet( + (basic_module): ModuleList( + (0): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (1): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (2): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), 
padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (3): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (4): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + (5): BasicModule( + (basic_module): Sequential( + (0): Conv2d(8, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (1): ReLU() + (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (3): ReLU() + (4): Conv2d(64, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (5): ReLU() + (6): Conv2d(32, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + (7): ReLU() + (8): Conv2d(16, 2, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3)) + ) + ) + ) + ) + (stage1): Stage( + (reshape): Sequential( + (0): Rearrange('n c d h w -> n d h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, 
out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): Identity() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + 
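A note on the attention widths printed throughout stage1-stage7: qkv_self maps 120 -> 360 (packed Q, K, V), qkv_mut likewise maps 120 -> 360, and proj takes 240 inputs back to 120, which is consistent with the self-attention and mutual-attention outputs being concatenated before projection. A minimal sketch of that wiring, with hypothetical class and variable names (not KAIR's actual TMSA code):

import torch
import torch.nn as nn

class ToyMutualSelfAttention(nn.Module):
    # Sketch of the qkv_self / qkv_mut / proj wiring suggested by the repr above.
    def __init__(self, dim=120, heads=6):
        super().__init__()
        self.heads = heads
        self.qkv_self = nn.Linear(dim, dim * 3)  # packed Q, K, V for self-attention
        self.qkv_mut = nn.Linear(dim, dim * 3)   # packed Q, K, V for mutual attention
        self.proj = nn.Linear(dim * 2, dim)      # 240 -> 120: fuse the two branches
        self.softmax = nn.Softmax(dim=-1)

    def attend(self, q, k, v):
        b, n, c = q.shape
        h, d = self.heads, c // self.heads
        q, k, v = (t.reshape(b, n, h, d).transpose(1, 2) for t in (q, k, v))
        attn = self.softmax(q @ k.transpose(-2, -1) * d ** -0.5)
        return (attn @ v).transpose(1, 2).reshape(b, n, c)

    def forward(self, x1, x2):
        # Self branch: tokens of x1 attend within x1.
        out_self = self.attend(*self.qkv_self(x1).chunk(3, dim=-1))
        # Mutual branch: queries from x1, keys/values from the other frame x2.
        q = self.qkv_mut(x1).chunk(3, dim=-1)[0]
        _, k, v = self.qkv_mut(x2).chunk(3, dim=-1)
        out_mut = self.attend(q, k, v)
        return self.proj(torch.cat([out_self, out_mut], dim=-1))  # back to 120 channels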
(drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage2): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + 
(act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) 
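The Mlp_GEGLU blocks printed above and below (fc11 and fc12 both 120 -> 240, a single GELU, fc2 240 -> 120) have the shape of a GEGLU feed-forward, where one linear branch is gated by the GELU of the other. A minimal sketch under that assumption (hypothetical names, not the KAIR class itself):

import torch
import torch.nn as nn

class ToyMlpGEGLU(nn.Module):
    # GEGLU feed-forward: fc2(GELU(fc11(x)) * fc12(x)), matching the printed shapes.
    def __init__(self, dim=120, hidden=240, drop=0.0):
        super().__init__()
        self.fc11 = nn.Linear(dim, hidden)  # branch passed through GELU (the gate)
        self.fc12 = nn.Linear(dim, hidden)  # linear branch, multiplied by the gate
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden, dim)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        return self.drop(self.fc2(self.act(self.fc11(x)) * self.fc12(x)))

x = torch.randn(2, 64, 120)
print(ToyMlpGEGLU()(x).shape)  # torch.Size([2, 64, 120])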
+ (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage3): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): 
Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, 
inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage4): Stage( + (reshape): Sequential( + (0): Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) + (1): LayerNorm((480,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=480, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, 
out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, 
kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage5): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + 
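Two patterns worth decoding in the Stage heads: (1) pa_deform's conv_offset stack ends in Conv2d(120, 324, ...), consistent with deformable_groups=12 and a 3x3 kernel (12*9*2 = 216 offset channels plus 12*9 = 108 mask channels), and its 242 input channels match two 120-channel feature maps plus a 2-channel optical flow; (2) the reshape heads differ between the downscaling stages (2-4), which open with Rearrange('n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2) followed by Linear(480, 120), i.e. a 2x space-to-depth (4*120 = 480) plus projection, and the upscaling stages (5-7), which, as just above, invert it with Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c') and Linear(30, 120), since 120/4 = 30. A quick shape check of both directions (sizes are illustrative):

import torch
from einops import rearrange

x = torch.randn(1, 120, 6, 64, 64)  # n c d h w
down = rearrange(x, 'n c d (h neih) (w neiw) -> n d h w (neiw neih c)', neih=2, neiw=2)
print(down.shape)  # torch.Size([1, 6, 32, 32, 480]) -> Linear(480, 120)

y = torch.randn(1, 120, 6, 32, 32)  # n (neiw neih c) d h w, so c = 120 / 4 = 30
up = rearrange(y, 'n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2)
print(up.shape)  # torch.Size([1, 6, 64, 64, 30]) -> Linear(30, 120)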
(attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, 
inplace=False) + ) + ) + (stage6): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), 
eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage7): Stage( + (reshape): Sequential( + (0): Rearrange('n (neiw neih c) d h w -> n d (h neih) (w neiw) c', neih=2, neiw=2) + (1): LayerNorm((30,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=30, out_features=120, bias=True) + (3): Rearrange('n d h w c -> n c d 
h w') + ) + (residual_group1): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (4): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (5): 
TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=240, out_features=120, bias=True) + (qkv_mut): Linear(in_features=120, out_features=360, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear1): Linear(in_features=120, out_features=120, bias=True) + (residual_group2): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=120, out_features=360, bias=True) + (proj): Linear(in_features=120, out_features=120, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=120, out_features=240, bias=True) + (fc12): Linear(in_features=120, out_features=240, bias=True) + (act): GELU() + (fc2): Linear(in_features=240, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear2): Linear(in_features=120, out_features=120, bias=True) + (pa_deform): DCNv2PackFlowGuided( + (conv_offset): Sequential( + (0): Conv2d(242, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): LeakyReLU(negative_slope=0.1, inplace=True) + (2): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (3): LeakyReLU(negative_slope=0.1, inplace=True) + (4): Conv2d(120, 120, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (5): LeakyReLU(negative_slope=0.1, inplace=True) + (6): Conv2d(120, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (pa_fuse): Mlp_GEGLU( + (fc11): Linear(in_features=360, out_features=360, bias=True) + (fc12): Linear(in_features=360, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=120, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (stage8): ModuleList( + (0): Sequential( + (0): Rearrange('n c d h w -> n d h w c') + (1): LayerNorm((120,), eps=1e-05, elementwise_affine=True) + (2): Linear(in_features=120, out_features=180, bias=True) + (3): Rearrange('n d h w c -> n c d h w') + ) + (1): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + 
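Earlier in this dump, every stage8 block logs a relative_position_bias_table of shape [225, 6] and a relative_position_index of shape [64, 64] with min 0, max 224 and mean 112. Those numbers are exactly what the standard Swin-style construction produces for an 8x8 window with 6 heads: 64 tokens, (2*8-1)^2 = 225 distinct relative offsets. A sketch of that construction (the usual recipe, assumed rather than copied from KAIR):

import torch

ws = 8  # window side, matching the [64, 64] index and [225, 6] table in the log
coords = torch.stack(torch.meshgrid(torch.arange(ws), torch.arange(ws), indexing='ij'))
coords = coords.flatten(1)                        # (2, 64) token coordinates
rel = coords[:, :, None] - coords[:, None, :]     # (2, 64, 64), offsets in [-7, 7]
rel = rel.permute(1, 2, 0) + (ws - 1)             # shift both axes into [0, 14]
index = rel[..., 0] * (2 * ws - 1) + rel[..., 1]  # flatten pairs into [0, 224]
print(index.shape, index.min().item(), index.max().item(), index.float().mean().item())
# torch.Size([64, 64]) 0 224 112.0
# At runtime the per-head bias is gathered as table[index] -> (64, 64, 6).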
(drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (2): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + 
(fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (3): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, 
out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (4): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (5): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): 
Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + (6): RTMSA( + (residual_group): TMSAG( + (blocks): ModuleList( + (0): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (1): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) 
+ (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (2): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + (3): TMSA( + (norm1): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (attn): WindowAttention( + (qkv_self): Linear(in_features=180, out_features=540, bias=True) + (proj): Linear(in_features=180, out_features=180, bias=True) + (softmax): Softmax(dim=-1) + ) + (drop_path): DropPath() + (norm2): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (mlp): Mlp_GEGLU( + (fc11): Linear(in_features=180, out_features=360, bias=True) + (fc12): Linear(in_features=180, out_features=360, bias=True) + (act): GELU() + (fc2): Linear(in_features=360, out_features=180, bias=True) + (drop): Dropout(p=0.0, inplace=False) + ) + ) + ) + ) + (linear): Linear(in_features=180, out_features=180, bias=True) + ) + ) + (norm): LayerNorm((180,), eps=1e-05, elementwise_affine=True) + (conv_after_body): Linear(in_features=180, out_features=120, bias=True) + (conv_before_upsample): Sequential( + (0): Conv3d(120, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): LeakyReLU(negative_slope=0.01, inplace=True) + ) + (upsample): Upsample( + (0): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (1): Transpose_Dim12() + (2): PixelShuffle(upscale_factor=2) + (3): Transpose_Dim12() + (4): LeakyReLU(negative_slope=0.1, inplace=True) + (5): Conv3d(64, 256, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + (6): Transpose_Dim12() + (7): PixelShuffle(upscale_factor=2) + (8): Transpose_Dim12() + (9): LeakyReLU(negative_slope=0.1, inplace=True) + (10): Conv3d(64, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) + ) + (conv_last): Conv3d(64, 3, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1)) +) + +22-03-11 10:53:19.603 : + | mean | min | max | std || shape + | -0.000 | -1.462 | 1.580 | 0.103 | torch.Size([120, 27, 1, 3, 3]) || conv_first.weight + | 0.005 | -0.950 | 0.885 | 0.268 | torch.Size([120]) || conv_first.bias + | 0.449 | 0.406 | 0.485 | 0.040 | torch.Size([1, 3, 1, 1]) || spynet.mean + | 0.226 | 0.224 | 0.229 | 0.003 | torch.Size([1, 3, 1, 1]) || spynet.std + | -0.000 | -0.679 | 0.720 | 0.066 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.0.basic_module.0.weight + | -0.042 | -0.894 | 0.351 | 0.344 | torch.Size([32]) || spynet.basic_module.0.basic_module.0.bias + | -0.008 | -3.201 | 0.948 | 0.097 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.0.basic_module.2.weight + | 0.059 | -1.268 | 0.732 | 0.320 | torch.Size([64]) || spynet.basic_module.0.basic_module.2.bias + | -0.010 | -4.633 | 0.568 | 0.089 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.0.basic_module.4.weight + | 0.159 | -0.704 | 0.859 | 0.353 | torch.Size([32]) || spynet.basic_module.0.basic_module.4.bias + | -0.024 | -1.714 | 0.414 | 0.091 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.0.basic_module.6.weight 
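Note: the VRT structure dump ends at conv_last above; the per-tensor statistics listing continues below. For reference, every TMSA block printed in the structure pairs WindowAttention with an Mlp_GEGLU feed-forward (parallel fc11/fc12 projections 180->360, a GELU gate, then fc2 back to 180). A minimal sketch of that gated MLP, assuming the standard GEGLU formulation act(fc11(x)) * fc12(x) rather than KAIR's exact source:

import torch
import torch.nn as nn

class Mlp_GEGLU(nn.Module):
    # Gated-GELU feed-forward as printed in the structure dump:
    # two parallel dim->hidden projections, a GELU gate on the first,
    # elementwise product, then hidden->dim. Dropout is p=0.0 in this run.
    def __init__(self, dim=180, hidden=360, drop=0.0):
        super().__init__()
        self.fc11 = nn.Linear(dim, hidden)  # gated branch
        self.fc12 = nn.Linear(dim, hidden)  # linear branch
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden, dim)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        return self.drop(self.fc2(self.act(self.fc11(x)) * self.fc12(x)))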
+ | 0.780 | -1.061 | 1.162 | 0.519 | torch.Size([16]) || spynet.basic_module.0.basic_module.6.bias + | 0.000 | -0.144 | 0.163 | 0.018 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.0.basic_module.8.weight + | 0.001 | -0.003 | 0.005 | 0.006 | torch.Size([2]) || spynet.basic_module.0.basic_module.8.bias + | 0.000 | -0.726 | 0.773 | 0.070 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.1.basic_module.0.weight + | -0.021 | -0.814 | 0.355 | 0.323 | torch.Size([32]) || spynet.basic_module.1.basic_module.0.bias + | -0.010 | -3.380 | 0.916 | 0.099 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.1.basic_module.2.weight + | 0.038 | -1.207 | 0.714 | 0.301 | torch.Size([64]) || spynet.basic_module.1.basic_module.2.bias + | -0.008 | -4.462 | 0.549 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.1.basic_module.4.weight + | 0.157 | -0.742 | 0.980 | 0.384 | torch.Size([32]) || spynet.basic_module.1.basic_module.4.bias + | -0.020 | -1.648 | 0.319 | 0.084 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.1.basic_module.6.weight + | 0.775 | -1.195 | 1.148 | 0.546 | torch.Size([16]) || spynet.basic_module.1.basic_module.6.bias + | -0.000 | -0.122 | 0.152 | 0.016 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.1.basic_module.8.weight + | -0.000 | -0.002 | 0.001 | 0.002 | torch.Size([2]) || spynet.basic_module.1.basic_module.8.bias + | 0.000 | -0.956 | 0.870 | 0.088 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.2.basic_module.0.weight + | -0.025 | -1.040 | 0.512 | 0.411 | torch.Size([32]) || spynet.basic_module.2.basic_module.0.bias + | -0.011 | -4.624 | 1.195 | 0.116 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.2.basic_module.2.weight + | 0.023 | -1.284 | 0.699 | 0.308 | torch.Size([64]) || spynet.basic_module.2.basic_module.2.bias + | -0.009 | -1.831 | 0.616 | 0.092 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.2.basic_module.4.weight + | 0.120 | -0.695 | 0.755 | 0.332 | torch.Size([32]) || spynet.basic_module.2.basic_module.4.bias + | -0.013 | -1.285 | 0.304 | 0.068 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.2.basic_module.6.weight + | 0.681 | -1.725 | 0.942 | 0.646 | torch.Size([16]) || spynet.basic_module.2.basic_module.6.bias + | 0.000 | -0.045 | 0.071 | 0.009 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.2.basic_module.8.weight + | -0.010 | -0.010 | -0.009 | 0.000 | torch.Size([2]) || spynet.basic_module.2.basic_module.8.bias + | -0.000 | -0.995 | 0.879 | 0.090 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.3.basic_module.0.weight + | -0.040 | -1.137 | 0.617 | 0.461 | torch.Size([32]) || spynet.basic_module.3.basic_module.0.bias + | -0.010 | -4.891 | 1.224 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.3.basic_module.2.weight + | 0.022 | -1.287 | 0.745 | 0.313 | torch.Size([64]) || spynet.basic_module.3.basic_module.2.bias + | -0.010 | -1.802 | 0.561 | 0.090 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.3.basic_module.4.weight + | 0.118 | -0.694 | 0.697 | 0.329 | torch.Size([32]) || spynet.basic_module.3.basic_module.4.bias + | -0.012 | -1.107 | 0.306 | 0.064 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.3.basic_module.6.weight + | 0.658 | -1.792 | 0.905 | 0.659 | torch.Size([16]) || spynet.basic_module.3.basic_module.6.bias + | 0.000 | -0.030 | 0.037 | 0.006 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.3.basic_module.8.weight + | 0.003 | -0.001 | 0.007 | 0.006 | torch.Size([2]) || spynet.basic_module.3.basic_module.8.bias + | -0.000 | -0.990 | 0.880 | 0.090 | torch.Size([32, 8, 7, 7]) || 
spynet.basic_module.4.basic_module.0.weight + | -0.010 | -1.067 | 0.596 | 0.437 | torch.Size([32]) || spynet.basic_module.4.basic_module.0.bias + | -0.010 | -5.061 | 1.229 | 0.117 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.4.basic_module.2.weight + | 0.024 | -1.274 | 0.830 | 0.318 | torch.Size([64]) || spynet.basic_module.4.basic_module.2.bias + | -0.009 | -1.787 | 0.563 | 0.088 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.4.basic_module.4.weight + | 0.130 | -0.685 | 0.743 | 0.335 | torch.Size([32]) || spynet.basic_module.4.basic_module.4.bias + | -0.011 | -0.973 | 0.292 | 0.061 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.4.basic_module.6.weight + | 0.659 | -1.855 | 0.931 | 0.679 | torch.Size([16]) || spynet.basic_module.4.basic_module.6.bias + | 0.000 | -0.034 | 0.040 | 0.005 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.4.basic_module.8.weight + | -0.001 | -0.009 | 0.007 | 0.012 | torch.Size([2]) || spynet.basic_module.4.basic_module.8.bias + | -0.000 | -0.973 | 0.853 | 0.089 | torch.Size([32, 8, 7, 7]) || spynet.basic_module.5.basic_module.0.weight + | 0.022 | -1.001 | 0.571 | 0.440 | torch.Size([32]) || spynet.basic_module.5.basic_module.0.bias + | -0.009 | -5.095 | 1.251 | 0.119 | torch.Size([64, 32, 7, 7]) || spynet.basic_module.5.basic_module.2.weight + | 0.026 | -1.305 | 0.880 | 0.326 | torch.Size([64]) || spynet.basic_module.5.basic_module.2.bias + | -0.008 | -1.815 | 0.561 | 0.091 | torch.Size([32, 64, 7, 7]) || spynet.basic_module.5.basic_module.4.weight + | 0.137 | -0.711 | 0.771 | 0.342 | torch.Size([32]) || spynet.basic_module.5.basic_module.4.bias + | -0.010 | -0.986 | 0.286 | 0.059 | torch.Size([16, 32, 7, 7]) || spynet.basic_module.5.basic_module.6.weight + | 0.671 | -1.913 | 0.966 | 0.700 | torch.Size([16]) || spynet.basic_module.5.basic_module.6.bias + | 0.000 | -0.034 | 0.028 | 0.002 | torch.Size([2, 16, 7, 7]) || spynet.basic_module.5.basic_module.8.weight + | 0.002 | -0.013 | 0.016 | 0.020 | torch.Size([2]) || spynet.basic_module.5.basic_module.8.bias + | 1.280 | 0.669 | 1.862 | 0.274 | torch.Size([120]) || stage1.reshape.1.weight + | -0.006 | -0.324 | 0.337 | 0.106 | torch.Size([120]) || stage1.reshape.1.bias + | 0.579 | 0.129 | 1.064 | 0.236 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.weight + | -0.039 | -1.100 | 0.894 | 0.226 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm1.bias + | -0.134 | -4.020 | 2.585 | 0.295 | torch.Size([675, 6]) || stage1.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.579 | 0.618 | 0.113 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_self.weight + | 0.000 | -0.319 | 0.279 | 0.074 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_self.bias + | 0.001 | -0.634 | 0.686 | 0.076 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.attn.proj.weight + | -0.014 | -0.222 | 0.642 | 0.088 | torch.Size([120]) || stage1.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -1.066 | 0.928 | 0.097 | torch.Size([360, 120]) || stage1.residual_group1.blocks.0.attn.qkv_mut.weight + | 0.000 | -0.146 | 0.190 | 0.033 | torch.Size([360]) || stage1.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.781 | 0.367 | 1.203 | 0.160 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.weight + | 
0.029 | -0.378 | 0.545 | 0.159 | torch.Size([120]) || stage1.residual_group1.blocks.0.norm2.bias + | 0.001 | -0.687 | 0.753 | 0.108 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc11.weight + | -0.010 | -0.229 | 0.633 | 0.095 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.674 | 0.669 | 0.117 | torch.Size([240, 120]) || stage1.residual_group1.blocks.0.mlp.fc12.weight + | 0.011 | -0.448 | 0.368 | 0.116 | torch.Size([240]) || stage1.residual_group1.blocks.0.mlp.fc12.bias + | 0.001 | -0.862 | 0.941 | 0.119 | torch.Size([120, 240]) || stage1.residual_group1.blocks.0.mlp.fc2.weight + | -0.004 | -0.267 | 0.594 | 0.099 | torch.Size([120]) || stage1.residual_group1.blocks.0.mlp.fc2.bias + | 0.797 | 0.211 | 1.475 | 0.209 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.weight + | -0.161 | -1.941 | 0.746 | 0.237 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm1.bias + | -0.296 | -3.927 | 2.840 | 0.478 | torch.Size([675, 6]) || stage1.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.1.attn.position_bias + | 0.001 | -1.479 | 1.395 | 0.143 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_self.weight + | -0.003 | -0.381 | 0.258 | 0.063 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_self.bias + | -0.000 | -0.526 | 0.561 | 0.079 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.attn.proj.weight + | -0.003 | -0.178 | 0.478 | 0.078 | torch.Size([120]) || stage1.residual_group1.blocks.1.attn.proj.bias + | 0.001 | -1.242 | 1.138 | 0.105 | torch.Size([360, 120]) || stage1.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.004 | -0.213 | 0.196 | 0.050 | torch.Size([360]) || stage1.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.702 | 0.349 | 0.904 | 0.085 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.weight + | 0.039 | -0.646 | 0.384 | 0.132 | torch.Size([120]) || stage1.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.872 | 0.750 | 0.131 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc11.weight + | -0.049 | -0.353 | 0.135 | 0.084 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.562 | 0.580 | 0.117 | torch.Size([240, 120]) || stage1.residual_group1.blocks.1.mlp.fc12.weight + | 0.000 | -0.238 | 0.457 | 0.113 | torch.Size([240]) || stage1.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.828 | 0.685 | 0.123 | torch.Size([120, 240]) || stage1.residual_group1.blocks.1.mlp.fc2.weight + | 0.031 | -0.297 | 0.419 | 0.094 | torch.Size([120]) || stage1.residual_group1.blocks.1.mlp.fc2.bias + | 0.984 | 0.163 | 1.398 | 0.202 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.weight + | -0.167 | -1.609 | 0.367 | 0.182 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm1.bias + | -0.343 | -4.484 | 2.362 | 0.486 | torch.Size([675, 6]) || stage1.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.2.attn.position_bias + | 0.000 | -1.586 | 1.649 | 0.151 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_self.weight + | -0.000 | -0.220 | 
0.240 | 0.056 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.378 | 0.514 | 0.086 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.attn.proj.weight + | -0.009 | -0.143 | 0.172 | 0.059 | torch.Size([120]) || stage1.residual_group1.blocks.2.attn.proj.bias + | 0.001 | -0.639 | 0.582 | 0.102 | torch.Size([360, 120]) || stage1.residual_group1.blocks.2.attn.qkv_mut.weight + | -0.000 | -0.141 | 0.173 | 0.035 | torch.Size([360]) || stage1.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.733 | 0.277 | 0.903 | 0.081 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.weight + | 0.038 | -0.861 | 0.359 | 0.142 | torch.Size([120]) || stage1.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.787 | 0.679 | 0.131 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc11.weight + | -0.029 | -0.365 | 0.143 | 0.076 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.574 | 0.539 | 0.120 | torch.Size([240, 120]) || stage1.residual_group1.blocks.2.mlp.fc12.weight + | -0.007 | -0.283 | 0.254 | 0.097 | torch.Size([240]) || stage1.residual_group1.blocks.2.mlp.fc12.bias + | 0.001 | -0.998 | 0.522 | 0.124 | torch.Size([120, 240]) || stage1.residual_group1.blocks.2.mlp.fc2.weight + | 0.030 | -0.169 | 0.293 | 0.095 | torch.Size([120]) || stage1.residual_group1.blocks.2.mlp.fc2.bias + | 1.035 | 0.143 | 1.397 | 0.196 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.weight + | -0.161 | -1.413 | 0.084 | 0.154 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm1.bias + | -0.441 | -4.685 | 3.306 | 0.529 | torch.Size([675, 6]) || stage1.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.3.attn.position_bias + | 0.000 | -1.590 | 1.329 | 0.155 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_self.weight + | -0.002 | -0.266 | 0.232 | 0.049 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.366 | 0.372 | 0.084 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.attn.proj.weight + | -0.011 | -0.225 | 0.171 | 0.071 | torch.Size([120]) || stage1.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -0.660 | 0.801 | 0.100 | torch.Size([360, 120]) || stage1.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.001 | -0.139 | 0.200 | 0.031 | torch.Size([360]) || stage1.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.724 | 0.190 | 0.911 | 0.091 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.weight + | 0.038 | -0.981 | 0.285 | 0.137 | torch.Size([120]) || stage1.residual_group1.blocks.3.norm2.bias + | 0.001 | -0.611 | 0.598 | 0.130 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc11.weight + | -0.035 | -0.299 | 0.221 | 0.081 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.502 | 0.520 | 0.124 | torch.Size([240, 120]) || stage1.residual_group1.blocks.3.mlp.fc12.weight + | -0.002 | -0.271 | 0.215 | 0.090 | torch.Size([240]) || stage1.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.558 | 0.898 | 0.127 | torch.Size([120, 240]) || stage1.residual_group1.blocks.3.mlp.fc2.weight + | 0.010 | -0.424 | 0.190 | 0.082 | torch.Size([120]) || stage1.residual_group1.blocks.3.mlp.fc2.bias + | 1.085 | 0.169 | 1.400 | 0.157 | torch.Size([120]) || 
stage1.residual_group1.blocks.4.norm1.weight + | -0.086 | -1.613 | 0.150 | 0.160 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm1.bias + | -0.541 | -3.902 | 3.728 | 0.633 | torch.Size([675, 6]) || stage1.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.4.attn.position_bias + | 0.001 | -1.879 | 1.832 | 0.150 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_self.weight + | 0.001 | -0.391 | 0.444 | 0.079 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_self.bias + | -0.000 | -0.407 | 0.448 | 0.087 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.attn.proj.weight + | -0.013 | -0.302 | 0.342 | 0.104 | torch.Size([120]) || stage1.residual_group1.blocks.4.attn.proj.bias + | -0.001 | -0.830 | 0.863 | 0.102 | torch.Size([360, 120]) || stage1.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.001 | -0.117 | 0.094 | 0.024 | torch.Size([360]) || stage1.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.704 | 0.195 | 0.870 | 0.079 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.weight + | 0.031 | -1.069 | 0.276 | 0.140 | torch.Size([120]) || stage1.residual_group1.blocks.4.norm2.bias + | -0.000 | -0.656 | 0.555 | 0.130 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc11.weight + | -0.029 | -0.387 | 0.256 | 0.102 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc11.bias + | 0.001 | -0.590 | 0.624 | 0.127 | torch.Size([240, 120]) || stage1.residual_group1.blocks.4.mlp.fc12.weight + | -0.011 | -0.277 | 0.303 | 0.087 | torch.Size([240]) || stage1.residual_group1.blocks.4.mlp.fc12.bias + | -0.000 | -1.124 | 0.539 | 0.130 | torch.Size([120, 240]) || stage1.residual_group1.blocks.4.mlp.fc2.weight + | -0.006 | -0.718 | 0.133 | 0.094 | torch.Size([120]) || stage1.residual_group1.blocks.4.mlp.fc2.bias + | 1.037 | 0.176 | 1.327 | 0.158 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.weight + | -0.112 | -1.591 | 0.177 | 0.169 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm1.bias + | -0.438 | -2.229 | 2.797 | 0.523 | torch.Size([675, 6]) || stage1.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage1.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage1.residual_group1.blocks.5.attn.position_bias + | -0.000 | -2.212 | 1.826 | 0.153 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_self.weight + | 0.001 | -0.343 | 0.338 | 0.068 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_self.bias + | 0.000 | -0.367 | 0.451 | 0.087 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.attn.proj.weight + | -0.022 | -0.358 | 0.242 | 0.128 | torch.Size([120]) || stage1.residual_group1.blocks.5.attn.proj.bias + | 0.001 | -0.922 | 0.886 | 0.104 | torch.Size([360, 120]) || stage1.residual_group1.blocks.5.attn.qkv_mut.weight + | 0.002 | -0.083 | 0.089 | 0.022 | torch.Size([360]) || stage1.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.662 | 0.277 | 0.831 | 0.066 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.weight + | 0.025 | -0.959 | 0.261 | 0.132 | torch.Size([120]) || stage1.residual_group1.blocks.5.norm2.bias + | -0.001 | -0.636 | 0.739 | 0.129 | torch.Size([240, 120]) || 
stage1.residual_group1.blocks.5.mlp.fc11.weight + | -0.030 | -0.419 | 0.517 | 0.115 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.615 | 0.709 | 0.126 | torch.Size([240, 120]) || stage1.residual_group1.blocks.5.mlp.fc12.weight + | 0.002 | -0.230 | 0.457 | 0.087 | torch.Size([240]) || stage1.residual_group1.blocks.5.mlp.fc12.bias + | 0.001 | -1.724 | 1.186 | 0.132 | torch.Size([120, 240]) || stage1.residual_group1.blocks.5.mlp.fc2.weight + | -0.019 | -1.909 | 0.255 | 0.190 | torch.Size([120]) || stage1.residual_group1.blocks.5.mlp.fc2.bias + | -0.000 | -0.242 | 0.244 | 0.057 | torch.Size([120, 120]) || stage1.linear1.weight + | 0.004 | -0.221 | 0.224 | 0.083 | torch.Size([120]) || stage1.linear1.bias + | 0.737 | 0.334 | 1.046 | 0.119 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.weight + | 0.013 | -0.911 | 0.763 | 0.193 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm1.bias + | -0.052 | -2.462 | 2.040 | 0.273 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.785 | 0.767 | 0.123 | torch.Size([360, 120]) || stage1.residual_group2.blocks.0.attn.qkv_self.weight + | 0.009 | -0.466 | 0.552 | 0.122 | torch.Size([360]) || stage1.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.431 | 0.475 | 0.091 | torch.Size([120, 120]) || stage1.residual_group2.blocks.0.attn.proj.weight + | -0.009 | -0.796 | 0.497 | 0.109 | torch.Size([120]) || stage1.residual_group2.blocks.0.attn.proj.bias + | 0.573 | 0.409 | 0.935 | 0.096 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.weight + | 0.015 | -0.828 | 0.839 | 0.175 | torch.Size([120]) || stage1.residual_group2.blocks.0.norm2.bias + | 0.001 | -0.604 | 0.542 | 0.109 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc11.weight + | 0.037 | -0.179 | 0.273 | 0.076 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.666 | 0.553 | 0.116 | torch.Size([240, 120]) || stage1.residual_group2.blocks.0.mlp.fc12.weight + | -0.001 | -0.416 | 0.396 | 0.116 | torch.Size([240]) || stage1.residual_group2.blocks.0.mlp.fc12.bias + | 0.001 | -0.654 | 0.538 | 0.118 | torch.Size([120, 240]) || stage1.residual_group2.blocks.0.mlp.fc2.weight + | -0.002 | -0.470 | 0.310 | 0.122 | torch.Size([120]) || stage1.residual_group2.blocks.0.mlp.fc2.bias + | 0.951 | 0.342 | 1.189 | 0.111 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.weight + | 0.010 | -0.697 | 0.802 | 0.166 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm1.bias + | -0.098 | -2.648 | 2.410 | 0.214 | torch.Size([2475, 6]) || stage1.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage1.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.733 | 0.886 | 0.139 | torch.Size([360, 120]) || stage1.residual_group2.blocks.1.attn.qkv_self.weight + | -0.002 | -0.468 | 0.550 | 0.132 | torch.Size([360]) || stage1.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.435 | 0.377 | 0.096 | torch.Size([120, 120]) || stage1.residual_group2.blocks.1.attn.proj.weight + | -0.001 | -0.359 | 0.258 | 0.114 | torch.Size([120]) || stage1.residual_group2.blocks.1.attn.proj.bias + | 0.582 | 0.305 | 0.717 | 0.055 | torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.weight + | 0.008 | -0.714 | 0.833 | 0.131 | 
torch.Size([120]) || stage1.residual_group2.blocks.1.norm2.bias + | 0.001 | -0.732 | 0.501 | 0.118 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc11.weight + | 0.004 | -0.306 | 0.267 | 0.091 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.510 | 0.533 | 0.126 | torch.Size([240, 120]) || stage1.residual_group2.blocks.1.mlp.fc12.weight + | -0.000 | -0.315 | 0.291 | 0.090 | torch.Size([240]) || stage1.residual_group2.blocks.1.mlp.fc12.bias + | 0.000 | -0.736 | 0.789 | 0.126 | torch.Size([120, 240]) || stage1.residual_group2.blocks.1.mlp.fc2.weight + | -0.000 | -1.274 | 1.328 | 0.200 | torch.Size([120]) || stage1.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.390 | 0.303 | 0.069 | torch.Size([120, 120]) || stage1.linear2.weight + | 0.010 | -0.219 | 0.227 | 0.087 | torch.Size([120]) || stage1.linear2.bias + | -0.000 | -0.095 | 0.106 | 0.024 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.weight + | -0.001 | -0.036 | 0.036 | 0.013 | torch.Size([120]) || stage1.pa_deform.bias + | -0.000 | -0.136 | 0.141 | 0.017 | torch.Size([120, 242, 3, 3]) || stage1.pa_deform.conv_offset.0.weight + | -0.002 | -0.028 | 0.024 | 0.013 | torch.Size([120]) || stage1.pa_deform.conv_offset.0.bias + | -0.001 | -0.156 | 0.104 | 0.019 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.2.weight + | -0.008 | -0.055 | 0.045 | 0.022 | torch.Size([120]) || stage1.pa_deform.conv_offset.2.bias + | -0.001 | -0.098 | 0.106 | 0.018 | torch.Size([120, 120, 3, 3]) || stage1.pa_deform.conv_offset.4.weight + | -0.000 | -0.081 | 0.070 | 0.029 | torch.Size([120]) || stage1.pa_deform.conv_offset.4.bias + | -0.000 | -0.375 | 0.279 | 0.027 | torch.Size([324, 120, 3, 3]) || stage1.pa_deform.conv_offset.6.weight + | -0.003 | -0.074 | 0.070 | 0.028 | torch.Size([324]) || stage1.pa_deform.conv_offset.6.bias + | -0.000 | -0.776 | 0.733 | 0.114 | torch.Size([360, 360]) || stage1.pa_fuse.fc11.weight + | 0.021 | -0.239 | 0.513 | 0.121 | torch.Size([360]) || stage1.pa_fuse.fc11.bias + | 0.001 | -1.100 | 1.143 | 0.149 | torch.Size([360, 360]) || stage1.pa_fuse.fc12.weight + | 0.008 | -0.405 | 0.393 | 0.136 | torch.Size([360]) || stage1.pa_fuse.fc12.bias + | 0.000 | -0.963 | 0.899 | 0.142 | torch.Size([120, 360]) || stage1.pa_fuse.fc2.weight + | -0.055 | -0.616 | 0.599 | 0.197 | torch.Size([120]) || stage1.pa_fuse.fc2.bias + | 1.149 | 0.345 | 1.921 | 0.289 | torch.Size([480]) || stage2.reshape.1.weight + | 0.017 | -0.502 | 0.663 | 0.141 | torch.Size([480]) || stage2.reshape.1.bias + | -0.000 | -0.609 | 0.736 | 0.146 | torch.Size([120, 480]) || stage2.reshape.2.weight + | 0.006 | -0.136 | 0.404 | 0.077 | torch.Size([120]) || stage2.reshape.2.bias + | 0.686 | 0.172 | 1.113 | 0.175 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.weight + | -0.154 | -0.926 | 0.339 | 0.217 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm1.bias + | -0.120 | -1.869 | 4.616 | 0.310 | torch.Size([675, 6]) || stage2.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.0.attn.position_bias + | 0.000 | -0.514 | 0.499 | 0.102 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_self.weight + | -0.002 | -0.214 | 0.177 | 0.044 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_self.bias + | -0.001 | -0.499 | 0.529 | 0.093 | 
torch.Size([120, 240]) || stage2.residual_group1.blocks.0.attn.proj.weight + | -0.004 | -0.171 | 0.556 | 0.087 | torch.Size([120]) || stage2.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.642 | 0.598 | 0.083 | torch.Size([360, 120]) || stage2.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.141 | 0.125 | 0.027 | torch.Size([360]) || stage2.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.592 | 0.325 | 0.794 | 0.096 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.weight + | 0.008 | -0.649 | 0.445 | 0.168 | torch.Size([120]) || stage2.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.485 | 0.457 | 0.116 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc11.weight + | -0.053 | -0.240 | 0.171 | 0.062 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc11.bias + | 0.000 | -0.503 | 0.462 | 0.118 | torch.Size([240, 120]) || stage2.residual_group1.blocks.0.mlp.fc12.weight + | 0.005 | -0.177 | 0.268 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.0.mlp.fc12.bias + | -0.000 | -0.690 | 0.498 | 0.123 | torch.Size([120, 240]) || stage2.residual_group1.blocks.0.mlp.fc2.weight + | -0.007 | -0.270 | 0.472 | 0.097 | torch.Size([120]) || stage2.residual_group1.blocks.0.mlp.fc2.bias + | 0.864 | 0.187 | 1.221 | 0.164 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.weight + | -0.146 | -1.128 | 0.299 | 0.204 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm1.bias + | -0.241 | -1.607 | 8.958 | 0.356 | torch.Size([675, 6]) || stage2.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.561 | 0.538 | 0.116 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_self.weight + | 0.001 | -0.198 | 0.222 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_self.bias + | 0.001 | -0.475 | 0.479 | 0.099 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.attn.proj.weight + | -0.006 | -0.295 | 0.341 | 0.101 | torch.Size([120]) || stage2.residual_group1.blocks.1.attn.proj.bias + | 0.001 | -0.961 | 0.789 | 0.080 | torch.Size([360, 120]) || stage2.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.001 | -0.105 | 0.143 | 0.024 | torch.Size([360]) || stage2.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.653 | 0.401 | 0.810 | 0.063 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.weight + | 0.009 | -0.767 | 0.367 | 0.154 | torch.Size([120]) || stage2.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.486 | 0.499 | 0.117 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc11.weight + | -0.056 | -0.185 | 0.147 | 0.058 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.548 | 0.121 | torch.Size([240, 120]) || stage2.residual_group1.blocks.1.mlp.fc12.weight + | 0.002 | -0.231 | 0.177 | 0.071 | torch.Size([240]) || stage2.residual_group1.blocks.1.mlp.fc12.bias + | -0.001 | -0.578 | 0.609 | 0.123 | torch.Size([120, 240]) || stage2.residual_group1.blocks.1.mlp.fc2.weight + | -0.003 | -0.350 | 0.216 | 0.098 | torch.Size([120]) || stage2.residual_group1.blocks.1.mlp.fc2.bias + | 0.848 | 0.172 | 1.107 | 0.144 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.weight + | -0.168 | -1.123 | 0.330 | 0.178 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm1.bias + | 
-0.074 | -1.239 | 4.293 | 0.247 | torch.Size([675, 6]) || stage2.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.2.attn.position_bias + | -0.001 | -0.643 | 0.531 | 0.117 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_self.weight + | 0.003 | -0.220 | 0.376 | 0.047 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.529 | 0.479 | 0.100 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.230 | 0.295 | 0.074 | torch.Size([120]) || stage2.residual_group1.blocks.2.attn.proj.bias + | -0.001 | -0.726 | 0.768 | 0.091 | torch.Size([360, 120]) || stage2.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.001 | -0.167 | 0.193 | 0.028 | torch.Size([360]) || stage2.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.695 | 0.334 | 0.833 | 0.068 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.weight + | 0.012 | -0.755 | 0.517 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.2.norm2.bias + | 0.001 | -0.474 | 0.480 | 0.119 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc11.weight + | -0.049 | -0.218 | 0.148 | 0.067 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.529 | 0.542 | 0.124 | torch.Size([240, 120]) || stage2.residual_group1.blocks.2.mlp.fc12.weight + | -0.006 | -0.245 | 0.239 | 0.073 | torch.Size([240]) || stage2.residual_group1.blocks.2.mlp.fc12.bias + | -0.001 | -0.541 | 0.485 | 0.124 | torch.Size([120, 240]) || stage2.residual_group1.blocks.2.mlp.fc2.weight + | 0.000 | -0.318 | 0.170 | 0.077 | torch.Size([120]) || stage2.residual_group1.blocks.2.mlp.fc2.bias + | 0.903 | 0.178 | 1.124 | 0.124 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.weight + | -0.138 | -1.223 | 0.440 | 0.177 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm1.bias + | -0.164 | -1.383 | 5.910 | 0.305 | torch.Size([675, 6]) || stage2.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.526 | 0.496 | 0.120 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_self.weight + | 0.000 | -0.250 | 0.273 | 0.061 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_self.bias + | 0.000 | -0.447 | 0.524 | 0.097 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.attn.proj.weight + | -0.003 | -0.243 | 0.256 | 0.082 | torch.Size([120]) || stage2.residual_group1.blocks.3.attn.proj.bias + | -0.001 | -0.551 | 0.730 | 0.083 | torch.Size([360, 120]) || stage2.residual_group1.blocks.3.attn.qkv_mut.weight + | -0.001 | -0.145 | 0.126 | 0.024 | torch.Size([360]) || stage2.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.707 | 0.319 | 0.855 | 0.063 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.weight + | 0.013 | -0.839 | 0.507 | 0.155 | torch.Size([120]) || stage2.residual_group1.blocks.3.norm2.bias + | 0.000 | -0.509 | 0.508 | 0.118 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc11.weight + | -0.051 | -0.219 | 0.155 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc11.bias + | 
-0.000 | -0.475 | 0.592 | 0.124 | torch.Size([240, 120]) || stage2.residual_group1.blocks.3.mlp.fc12.weight + | -0.002 | -0.162 | 0.220 | 0.069 | torch.Size([240]) || stage2.residual_group1.blocks.3.mlp.fc12.bias + | 0.000 | -0.465 | 0.528 | 0.124 | torch.Size([120, 240]) || stage2.residual_group1.blocks.3.mlp.fc2.weight + | -0.002 | -0.243 | 0.286 | 0.088 | torch.Size([120]) || stage2.residual_group1.blocks.3.mlp.fc2.bias + | 0.948 | 0.220 | 1.175 | 0.108 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.weight + | -0.125 | -1.093 | 0.385 | 0.157 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm1.bias + | -0.150 | -1.632 | 4.522 | 0.341 | torch.Size([675, 6]) || stage2.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.4.attn.position_bias + | -0.000 | -0.636 | 0.543 | 0.119 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_self.weight + | -0.001 | -0.254 | 0.262 | 0.048 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_self.bias + | 0.001 | -0.632 | 0.628 | 0.112 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.attn.proj.weight + | -0.005 | -0.240 | 0.330 | 0.104 | torch.Size([120]) || stage2.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -0.476 | 0.479 | 0.088 | torch.Size([360, 120]) || stage2.residual_group1.blocks.4.attn.qkv_mut.weight + | -0.001 | -0.112 | 0.134 | 0.020 | torch.Size([360]) || stage2.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.686 | 0.264 | 0.797 | 0.060 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.weight + | 0.012 | -0.889 | 0.427 | 0.140 | torch.Size([120]) || stage2.residual_group1.blocks.4.norm2.bias + | 0.001 | -0.476 | 0.478 | 0.117 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc11.weight + | -0.051 | -0.267 | 0.180 | 0.071 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.506 | 0.517 | 0.127 | torch.Size([240, 120]) || stage2.residual_group1.blocks.4.mlp.fc12.weight + | 0.002 | -0.172 | 0.241 | 0.068 | torch.Size([240]) || stage2.residual_group1.blocks.4.mlp.fc12.bias + | -0.001 | -0.570 | 0.542 | 0.126 | torch.Size([120, 240]) || stage2.residual_group1.blocks.4.mlp.fc2.weight + | -0.003 | -0.631 | 0.395 | 0.123 | torch.Size([120]) || stage2.residual_group1.blocks.4.mlp.fc2.bias + | 0.912 | 0.189 | 1.122 | 0.104 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.weight + | -0.114 | -1.125 | 0.188 | 0.140 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm1.bias + | -0.099 | -1.285 | 1.708 | 0.236 | torch.Size([675, 6]) || stage2.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage2.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage2.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.496 | 0.540 | 0.119 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_self.weight + | 0.003 | -0.260 | 0.228 | 0.052 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_self.bias + | -0.000 | -0.511 | 0.454 | 0.095 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.attn.proj.weight + | 0.000 | -0.711 | 0.286 | 0.115 | torch.Size([120]) || stage2.residual_group1.blocks.5.attn.proj.bias + | 0.000 | 
-0.444 | 0.454 | 0.082 | torch.Size([360, 120]) || stage2.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.000 | -0.101 | 0.133 | 0.021 | torch.Size([360]) || stage2.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.668 | 0.312 | 0.800 | 0.056 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.weight + | 0.015 | -0.778 | 0.372 | 0.111 | torch.Size([120]) || stage2.residual_group1.blocks.5.norm2.bias + | -0.000 | -0.485 | 0.469 | 0.115 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc11.weight + | -0.045 | -0.294 | 0.173 | 0.083 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc11.bias + | 0.000 | -0.554 | 0.540 | 0.129 | torch.Size([240, 120]) || stage2.residual_group1.blocks.5.mlp.fc12.weight + | 0.001 | -0.183 | 0.199 | 0.077 | torch.Size([240]) || stage2.residual_group1.blocks.5.mlp.fc12.bias + | 0.000 | -0.879 | 0.824 | 0.127 | torch.Size([120, 240]) || stage2.residual_group1.blocks.5.mlp.fc2.weight + | 0.001 | -1.670 | 0.358 | 0.208 | torch.Size([120]) || stage2.residual_group1.blocks.5.mlp.fc2.bias + | 0.001 | -0.253 | 0.346 | 0.068 | torch.Size([120, 120]) || stage2.linear1.weight + | 0.007 | -0.248 | 0.241 | 0.103 | torch.Size([120]) || stage2.linear1.bias + | 1.012 | 0.613 | 1.327 | 0.116 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.weight + | 0.019 | -0.724 | 0.685 | 0.244 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm1.bias + | 0.003 | -2.959 | 1.705 | 0.151 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.636 | 0.617 | 0.125 | torch.Size([360, 120]) || stage2.residual_group2.blocks.0.attn.qkv_self.weight + | -0.002 | -0.291 | 0.292 | 0.085 | torch.Size([360]) || stage2.residual_group2.blocks.0.attn.qkv_self.bias + | -0.002 | -0.476 | 0.512 | 0.138 | torch.Size([120, 120]) || stage2.residual_group2.blocks.0.attn.proj.weight + | -0.002 | -0.263 | 0.398 | 0.135 | torch.Size([120]) || stage2.residual_group2.blocks.0.attn.proj.bias + | 0.677 | 0.521 | 0.840 | 0.063 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.weight + | 0.010 | -0.710 | 0.541 | 0.173 | torch.Size([120]) || stage2.residual_group2.blocks.0.norm2.bias + | 0.001 | -0.540 | 0.507 | 0.112 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc11.weight + | -0.016 | -0.242 | 0.201 | 0.077 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc11.bias + | 0.000 | -0.519 | 0.479 | 0.122 | torch.Size([240, 120]) || stage2.residual_group2.blocks.0.mlp.fc12.weight + | -0.006 | -0.162 | 0.231 | 0.071 | torch.Size([240]) || stage2.residual_group2.blocks.0.mlp.fc12.bias + | -0.001 | -0.449 | 0.494 | 0.121 | torch.Size([120, 240]) || stage2.residual_group2.blocks.0.mlp.fc2.weight + | 0.002 | -0.293 | 0.222 | 0.095 | torch.Size([120]) || stage2.residual_group2.blocks.0.mlp.fc2.bias + | 1.053 | 0.832 | 1.269 | 0.079 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.weight + | 0.015 | -0.549 | 0.428 | 0.189 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm1.bias + | 0.007 | -3.099 | 1.550 | 0.170 | torch.Size([2475, 6]) || stage2.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage2.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.673 | 0.604 | 0.131 | torch.Size([360, 120]) || 
stage2.residual_group2.blocks.1.attn.qkv_self.weight + | -0.001 | -0.416 | 0.391 | 0.089 | torch.Size([360]) || stage2.residual_group2.blocks.1.attn.qkv_self.bias + | -0.000 | -0.569 | 0.560 | 0.139 | torch.Size([120, 120]) || stage2.residual_group2.blocks.1.attn.proj.weight + | 0.004 | -0.613 | 0.428 | 0.158 | torch.Size([120]) || stage2.residual_group2.blocks.1.attn.proj.bias + | 0.762 | 0.464 | 0.954 | 0.085 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.weight + | 0.005 | -0.745 | 0.381 | 0.117 | torch.Size([120]) || stage2.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.441 | 0.448 | 0.110 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc11.weight + | 0.019 | -0.292 | 0.460 | 0.117 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc11.bias + | -0.000 | -0.491 | 0.490 | 0.126 | torch.Size([240, 120]) || stage2.residual_group2.blocks.1.mlp.fc12.weight + | -0.007 | -0.285 | 0.177 | 0.068 | torch.Size([240]) || stage2.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.535 | 0.631 | 0.125 | torch.Size([120, 240]) || stage2.residual_group2.blocks.1.mlp.fc2.weight + | -0.011 | -0.765 | 0.337 | 0.142 | torch.Size([120]) || stage2.residual_group2.blocks.1.mlp.fc2.bias + | 0.001 | -0.367 | 0.372 | 0.074 | torch.Size([120, 120]) || stage2.linear2.weight + | 0.009 | -0.288 | 0.342 | 0.130 | torch.Size([120]) || stage2.linear2.bias + | 0.000 | -0.112 | 0.093 | 0.022 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.weight + | -0.002 | -0.036 | 0.035 | 0.016 | torch.Size([120]) || stage2.pa_deform.bias + | 0.000 | -0.068 | 0.080 | 0.016 | torch.Size([120, 242, 3, 3]) || stage2.pa_deform.conv_offset.0.weight + | -0.009 | -0.035 | 0.023 | 0.013 | torch.Size([120]) || stage2.pa_deform.conv_offset.0.bias + | 0.000 | -0.068 | 0.079 | 0.019 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.2.weight + | -0.014 | -0.061 | 0.036 | 0.021 | torch.Size([120]) || stage2.pa_deform.conv_offset.2.bias + | -0.001 | -0.082 | 0.079 | 0.019 | torch.Size([120, 120, 3, 3]) || stage2.pa_deform.conv_offset.4.weight + | -0.003 | -0.075 | 0.069 | 0.035 | torch.Size([120]) || stage2.pa_deform.conv_offset.4.bias + | -0.000 | -0.166 | 0.139 | 0.016 | torch.Size([324, 120, 3, 3]) || stage2.pa_deform.conv_offset.6.weight + | -0.015 | -0.090 | 0.050 | 0.030 | torch.Size([324]) || stage2.pa_deform.conv_offset.6.bias + | -0.002 | -0.642 | 0.663 | 0.127 | torch.Size([360, 360]) || stage2.pa_fuse.fc11.weight + | 0.130 | -0.171 | 0.480 | 0.140 | torch.Size([360]) || stage2.pa_fuse.fc11.bias + | -0.000 | -0.696 | 0.620 | 0.118 | torch.Size([360, 360]) || stage2.pa_fuse.fc12.weight + | -0.007 | -0.337 | 0.301 | 0.102 | torch.Size([360]) || stage2.pa_fuse.fc12.bias + | 0.000 | -0.650 | 0.657 | 0.128 | torch.Size([120, 360]) || stage2.pa_fuse.fc2.weight + | 0.013 | -0.507 | 0.451 | 0.215 | torch.Size([120]) || stage2.pa_fuse.fc2.bias + | 1.067 | 0.372 | 1.778 | 0.269 | torch.Size([480]) || stage3.reshape.1.weight + | -0.004 | -0.699 | 0.521 | 0.227 | torch.Size([480]) || stage3.reshape.1.bias + | -0.000 | -0.643 | 0.743 | 0.138 | torch.Size([120, 480]) || stage3.reshape.2.weight + | 0.009 | -0.176 | 0.243 | 0.079 | torch.Size([120]) || stage3.reshape.2.bias + | 0.785 | 0.469 | 1.029 | 0.105 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.weight + | -0.102 | -0.716 | 0.311 | 0.179 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm1.bias + | -0.001 | -0.340 | 0.163 | 0.033 | torch.Size([675, 6]) || 
stage3.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.328 | 0.302 | 0.061 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_self.weight + | 0.004 | -0.232 | 0.189 | 0.063 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_self.bias + | 0.000 | -0.343 | 0.346 | 0.058 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.attn.proj.weight + | 0.004 | -0.335 | 0.229 | 0.102 | torch.Size([120]) || stage3.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.366 | 0.325 | 0.052 | torch.Size([360, 120]) || stage3.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.001 | -0.091 | 0.074 | 0.017 | torch.Size([360]) || stage3.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.751 | 0.517 | 0.928 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.weight + | 0.002 | -0.271 | 0.189 | 0.101 | torch.Size([120]) || stage3.residual_group1.blocks.0.norm2.bias + | 0.000 | -0.371 | 0.388 | 0.096 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc11.weight + | -0.073 | -0.203 | 0.039 | 0.046 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.400 | 0.401 | 0.094 | torch.Size([240, 120]) || stage3.residual_group1.blocks.0.mlp.fc12.weight + | -0.000 | -0.178 | 0.128 | 0.052 | torch.Size([240]) || stage3.residual_group1.blocks.0.mlp.fc12.bias + | -0.001 | -0.410 | 0.429 | 0.098 | torch.Size([120, 240]) || stage3.residual_group1.blocks.0.mlp.fc2.weight + | 0.006 | -0.345 | 0.304 | 0.108 | torch.Size([120]) || stage3.residual_group1.blocks.0.mlp.fc2.bias + | 0.816 | 0.469 | 1.015 | 0.110 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.weight + | -0.103 | -0.647 | 0.225 | 0.140 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm1.bias + | 0.001 | -0.464 | 0.239 | 0.034 | torch.Size([675, 6]) || stage3.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.1.attn.position_bias + | -0.000 | -0.304 | 0.359 | 0.061 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_self.weight + | 0.001 | -0.173 | 0.193 | 0.047 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.299 | 0.408 | 0.055 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.attn.proj.weight + | 0.007 | -0.511 | 0.239 | 0.113 | torch.Size([120]) || stage3.residual_group1.blocks.1.attn.proj.bias + | 0.000 | -0.288 | 0.254 | 0.049 | torch.Size([360, 120]) || stage3.residual_group1.blocks.1.attn.qkv_mut.weight + | 0.001 | -0.060 | 0.054 | 0.016 | torch.Size([360]) || stage3.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.796 | 0.609 | 0.971 | 0.076 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.weight + | -0.002 | -0.327 | 0.247 | 0.122 | torch.Size([120]) || stage3.residual_group1.blocks.1.norm2.bias + | 0.001 | -0.379 | 0.407 | 0.094 | torch.Size([240, 120]) || stage3.residual_group1.blocks.1.mlp.fc11.weight + | -0.077 | -0.214 | 0.034 | 0.045 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc11.bias + | 0.000 | -0.391 | 0.432 | 0.092 | torch.Size([240, 120]) || 
stage3.residual_group1.blocks.1.mlp.fc12.weight + | 0.005 | -0.176 | 0.112 | 0.044 | torch.Size([240]) || stage3.residual_group1.blocks.1.mlp.fc12.bias + | 0.000 | -0.378 | 0.399 | 0.093 | torch.Size([120, 240]) || stage3.residual_group1.blocks.1.mlp.fc2.weight + | 0.009 | -0.410 | 0.306 | 0.110 | torch.Size([120]) || stage3.residual_group1.blocks.1.mlp.fc2.bias + | 0.854 | 0.447 | 0.995 | 0.090 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.weight + | -0.086 | -0.513 | 0.198 | 0.116 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm1.bias + | -0.001 | -0.189 | 0.292 | 0.033 | torch.Size([675, 6]) || stage3.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.2.attn.position_bias + | 0.000 | -0.390 | 0.367 | 0.067 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_self.weight + | -0.002 | -0.310 | 0.284 | 0.078 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_self.bias + | 0.000 | -0.334 | 0.296 | 0.061 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.attn.proj.weight + | 0.004 | -0.356 | 0.299 | 0.096 | torch.Size([120]) || stage3.residual_group1.blocks.2.attn.proj.bias + | 0.000 | -0.276 | 0.315 | 0.055 | torch.Size([360, 120]) || stage3.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.000 | -0.094 | 0.066 | 0.014 | torch.Size([360]) || stage3.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.829 | 0.673 | 1.017 | 0.074 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.weight + | 0.003 | -0.259 | 0.228 | 0.098 | torch.Size([120]) || stage3.residual_group1.blocks.2.norm2.bias + | 0.001 | -0.410 | 0.385 | 0.091 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc11.weight + | -0.085 | -0.200 | 0.017 | 0.044 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc11.bias + | 0.000 | -0.348 | 0.378 | 0.090 | torch.Size([240, 120]) || stage3.residual_group1.blocks.2.mlp.fc12.weight + | 0.001 | -0.130 | 0.105 | 0.042 | torch.Size([240]) || stage3.residual_group1.blocks.2.mlp.fc12.bias + | 0.000 | -0.346 | 0.425 | 0.090 | torch.Size([120, 240]) || stage3.residual_group1.blocks.2.mlp.fc2.weight + | 0.005 | -0.363 | 0.241 | 0.094 | torch.Size([120]) || stage3.residual_group1.blocks.2.mlp.fc2.bias + | 0.872 | 0.554 | 1.068 | 0.102 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.weight + | -0.057 | -0.402 | 0.133 | 0.087 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm1.bias + | 0.003 | -0.365 | 0.217 | 0.050 | torch.Size([675, 6]) || stage3.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.3.attn.position_bias + | 0.000 | -0.359 | 0.357 | 0.065 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_self.weight + | -0.002 | -0.265 | 0.294 | 0.062 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_self.bias + | -0.000 | -0.300 | 0.271 | 0.054 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.attn.proj.weight + | 0.002 | -0.316 | 0.215 | 0.094 | torch.Size([120]) || stage3.residual_group1.blocks.3.attn.proj.bias + | 0.000 | -0.370 | 0.329 | 0.039 | torch.Size([360, 120]) || 
+ | 0.000 | -0.370 | 0.329 | 0.039 | torch.Size([360, 120]) || stage3.residual_group1.blocks.3.attn.qkv_mut.weight
+ | 0.000 | -0.056 | 0.066 | 0.013 | torch.Size([360]) || stage3.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 0.842 | 0.631 | 0.989 | 0.073 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.weight
+ | -0.001 | -0.216 | 0.263 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.3.norm2.bias
+ | 0.001 | -0.388 | 0.391 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.087 | -0.202 | 0.032 | 0.048 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc11.bias
+ | 0.000 | -0.364 | 0.428 | 0.088 | torch.Size([240, 120]) || stage3.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.000 | -0.137 | 0.106 | 0.043 | torch.Size([240]) || stage3.residual_group1.blocks.3.mlp.fc12.bias
+ | -0.001 | -0.390 | 0.339 | 0.088 | torch.Size([120, 240]) || stage3.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.003 | -0.376 | 0.203 | 0.090 | torch.Size([120]) || stage3.residual_group1.blocks.3.mlp.fc2.bias
+ | 0.913 | 0.498 | 1.102 | 0.096 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.weight
+ | -0.048 | -0.340 | 0.105 | 0.071 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm1.bias
+ | 0.001 | -0.706 | 0.306 | 0.058 | torch.Size([675, 6]) || stage3.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.4.attn.position_bias
+ | 0.000 | -0.373 | 0.339 | 0.076 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.004 | -0.301 | 0.301 | 0.074 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_self.bias
+ | 0.000 | -0.278 | 0.277 | 0.058 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.attn.proj.weight
+ | 0.003 | -0.310 | 0.240 | 0.079 | torch.Size([120]) || stage3.residual_group1.blocks.4.attn.proj.bias
+ | -0.000 | -0.350 | 0.322 | 0.046 | torch.Size([360, 120]) || stage3.residual_group1.blocks.4.attn.qkv_mut.weight
+ | -0.000 | -0.045 | 0.064 | 0.010 | torch.Size([360]) || stage3.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.862 | 0.679 | 0.990 | 0.059 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.weight
+ | -0.004 | -0.313 | 0.190 | 0.083 | torch.Size([120]) || stage3.residual_group1.blocks.4.norm2.bias
+ | 0.001 | -0.370 | 0.364 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc11.weight
+ | -0.092 | -0.231 | 0.129 | 0.057 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc11.bias
+ | -0.000 | -0.375 | 0.511 | 0.090 | torch.Size([240, 120]) || stage3.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.002 | -0.114 | 0.114 | 0.040 | torch.Size([240]) || stage3.residual_group1.blocks.4.mlp.fc12.bias
+ | -0.000 | -0.389 | 0.354 | 0.088 | torch.Size([120, 240]) || stage3.residual_group1.blocks.4.mlp.fc2.weight
+ | 0.005 | -0.258 | 0.164 | 0.073 | torch.Size([120]) || stage3.residual_group1.blocks.4.mlp.fc2.bias
+ | 0.899 | 0.480 | 1.089 | 0.103 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.weight
+ | -0.030 | -0.257 | 0.115 | 0.056 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm1.bias
+ | 0.003 | -0.462 | 0.290 | 0.069 | torch.Size([675, 6]) || stage3.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage3.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage3.residual_group1.blocks.5.attn.position_bias
+ | 0.000 | -0.391 | 0.365 | 0.069 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_self.weight
+ | -0.004 | -0.232 | 0.302 | 0.064 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_self.bias
+ | -0.000 | -0.267 | 0.293 | 0.051 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.attn.proj.weight
+ | 0.000 | -0.250 | 0.182 | 0.070 | torch.Size([120]) || stage3.residual_group1.blocks.5.attn.proj.bias
+ | -0.000 | -0.238 | 0.257 | 0.033 | torch.Size([360, 120]) || stage3.residual_group1.blocks.5.attn.qkv_mut.weight
+ | -0.001 | -0.032 | 0.033 | 0.008 | torch.Size([360]) || stage3.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.864 | 0.651 | 1.029 | 0.070 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.weight
+ | -0.003 | -0.212 | 0.175 | 0.075 | torch.Size([120]) || stage3.residual_group1.blocks.5.norm2.bias
+ | 0.000 | -0.378 | 0.379 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc11.weight
+ | -0.097 | -0.308 | 0.026 | 0.051 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc11.bias
+ | 0.000 | -0.578 | 0.401 | 0.089 | torch.Size([240, 120]) || stage3.residual_group1.blocks.5.mlp.fc12.weight
+ | -0.005 | -0.166 | 0.131 | 0.049 | torch.Size([240]) || stage3.residual_group1.blocks.5.mlp.fc12.bias
+ | 0.000 | -0.358 | 0.376 | 0.085 | torch.Size([120, 240]) || stage3.residual_group1.blocks.5.mlp.fc2.weight
+ | 0.001 | -0.262 | 0.176 | 0.072 | torch.Size([120]) || stage3.residual_group1.blocks.5.mlp.fc2.bias
+ | 0.003 | -0.284 | 0.467 | 0.071 | torch.Size([120, 120]) || stage3.linear1.weight
+ | 0.006 | -0.201 | 0.269 | 0.090 | torch.Size([120]) || stage3.linear1.bias
+ | 0.877 | 0.568 | 1.197 | 0.115 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.weight
+ | 0.002 | -0.248 | 0.324 | 0.100 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm1.bias
+ | 0.000 | -0.261 | 0.125 | 0.029 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.563 | 0.552 | 0.074 | torch.Size([360, 120]) || stage3.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.005 | -0.257 | 0.302 | 0.081 | torch.Size([360]) || stage3.residual_group2.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.390 | 0.385 | 0.084 | torch.Size([120, 120]) || stage3.residual_group2.blocks.0.attn.proj.weight
+ | 0.002 | -0.450 | 0.235 | 0.125 | torch.Size([120]) || stage3.residual_group2.blocks.0.attn.proj.bias
+ | 0.986 | 0.755 | 1.165 | 0.078 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.weight
+ | -0.000 | -0.260 | 0.169 | 0.076 | torch.Size([120]) || stage3.residual_group2.blocks.0.norm2.bias
+ | 0.000 | -0.355 | 0.397 | 0.087 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.046 | -0.220 | 0.086 | 0.055 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.424 | 0.368 | 0.089 | torch.Size([240, 120]) || stage3.residual_group2.blocks.0.mlp.fc12.weight
+ | -0.006 | -0.111 | 0.122 | 0.038 | torch.Size([240]) || stage3.residual_group2.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.354 | 0.374 | 0.090 | torch.Size([120, 240]) || stage3.residual_group2.blocks.0.mlp.fc2.weight
+ | 0.001 | -0.374 | 0.272 | 0.101 | torch.Size([120]) || stage3.residual_group2.blocks.0.mlp.fc2.bias
+ | 0.919 | 0.643 | 1.132 | 0.100 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.weight
+ | 0.000 | -0.177 | 0.181 | 0.063 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm1.bias
+ | 0.000 | -0.332 | 0.131 | 0.028 | torch.Size([2475, 6]) || stage3.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage3.residual_group2.blocks.1.attn.relative_position_index
+ | -0.000 | -0.418 | 0.362 | 0.069 | torch.Size([360, 120]) || stage3.residual_group2.blocks.1.attn.qkv_self.weight
+ | -0.004 | -0.375 | 0.347 | 0.082 | torch.Size([360]) || stage3.residual_group2.blocks.1.attn.qkv_self.bias
+ | -0.001 | -0.294 | 0.354 | 0.077 | torch.Size([120, 120]) || stage3.residual_group2.blocks.1.attn.proj.weight
+ | 0.003 | -0.432 | 0.259 | 0.101 | torch.Size([120]) || stage3.residual_group2.blocks.1.attn.proj.bias
+ | 1.012 | 0.750 | 1.178 | 0.077 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.weight
+ | -0.001 | -0.171 | 0.155 | 0.060 | torch.Size([120]) || stage3.residual_group2.blocks.1.norm2.bias
+ | 0.000 | -0.331 | 0.356 | 0.087 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.035 | -0.207 | 0.197 | 0.065 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.399 | 0.398 | 0.092 | torch.Size([240, 120]) || stage3.residual_group2.blocks.1.mlp.fc12.weight
+ | -0.002 | -0.111 | 0.129 | 0.041 | torch.Size([240]) || stage3.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.001 | -0.353 | 0.330 | 0.088 | torch.Size([120, 240]) || stage3.residual_group2.blocks.1.mlp.fc2.weight
+ | -0.001 | -0.328 | 0.127 | 0.064 | torch.Size([120]) || stage3.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.003 | -0.289 | 0.519 | 0.073 | torch.Size([120, 120]) || stage3.linear2.weight
+ | 0.002 | -0.318 | 0.371 | 0.144 | torch.Size([120]) || stage3.linear2.bias
+ | -0.000 | -0.086 | 0.095 | 0.022 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.weight
+ | -0.002 | -0.023 | 0.021 | 0.010 | torch.Size([120]) || stage3.pa_deform.bias
+ | -0.000 | -0.060 | 0.056 | 0.015 | torch.Size([120, 242, 3, 3]) || stage3.pa_deform.conv_offset.0.weight
+ | -0.008 | -0.035 | 0.019 | 0.013 | torch.Size([120]) || stage3.pa_deform.conv_offset.0.bias
+ | -0.001 | -0.064 | 0.062 | 0.019 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.2.weight
+ | -0.007 | -0.044 | 0.031 | 0.019 | torch.Size([120]) || stage3.pa_deform.conv_offset.2.bias
+ | 0.000 | -0.062 | 0.063 | 0.019 | torch.Size([120, 120, 3, 3]) || stage3.pa_deform.conv_offset.4.weight
+ | -0.006 | -0.052 | 0.043 | 0.021 | torch.Size([120]) || stage3.pa_deform.conv_offset.4.bias
+ | 0.000 | -0.081 | 0.080 | 0.011 | torch.Size([324, 120, 3, 3]) || stage3.pa_deform.conv_offset.6.weight
+ | -0.004 | -0.087 | 0.083 | 0.021 | torch.Size([324]) || stage3.pa_deform.conv_offset.6.bias
+ | -0.002 | -0.465 | 0.513 | 0.101 | torch.Size([360, 360]) || stage3.pa_fuse.fc11.weight
+ | 0.059 | -0.251 | 0.595 | 0.104 | torch.Size([360]) || stage3.pa_fuse.fc11.bias
+ | -0.000 | -0.544 | 0.531 | 0.100 | torch.Size([360, 360]) || stage3.pa_fuse.fc12.weight
+ | 0.001 | -0.589 | 0.433 | 0.106 | torch.Size([360]) || stage3.pa_fuse.fc12.bias
+ | -0.000 | -0.535 | 0.562 | 0.127 | torch.Size([120, 360]) || stage3.pa_fuse.fc2.weight
+ | -0.001 | -0.401 | 0.342 | 0.121 | torch.Size([120]) || stage3.pa_fuse.fc2.bias
+ | 0.997 | 0.921 | 1.125 | 0.028 | torch.Size([480]) || stage4.reshape.1.weight
+ | -0.000 | -0.058 | 0.059 | 0.022 | torch.Size([480]) || stage4.reshape.1.bias
+ | 0.000 | -0.155 | 0.150 | 0.031 | torch.Size([120, 480]) || stage4.reshape.2.weight
+ | 0.001 | -0.016 | 0.016 | 0.006 | torch.Size([120]) || stage4.reshape.2.bias
+ | 1.002 | 0.999 | 1.009 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.weight
+ | 0.000 | -0.002 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm1.bias
+ | -0.000 | -0.071 | 0.066 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.0.attn.position_bias
+ | 0.000 | -0.093 | 0.081 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.000 | -0.009 | 0.009 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.080 | 0.097 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.attn.proj.weight
+ | 0.000 | -0.035 | 0.027 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.0.attn.proj.bias
+ | 0.000 | -0.080 | 0.079 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.0.attn.qkv_mut.weight
+ | -0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.0.norm2.bias
+ | -0.000 | -0.079 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc11.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.087 | 0.092 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.0.mlp.fc12.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.080 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.031 | 0.029 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.0.mlp.fc2.bias
+ | 1.002 | 0.997 | 1.007 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.weight
+ | -0.000 | -0.002 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm1.bias
+ | 0.000 | -0.066 | 0.065 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.1.attn.position_bias
+ | -0.000 | -0.078 | 0.081 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.006 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.080 | 0.083 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.attn.proj.weight
+ | -0.000 | -0.027 | 0.029 | 0.012 | torch.Size([120]) || stage4.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.077 | 0.082 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.1.attn.qkv_mut.weight
+ | -0.000 | -0.006 | 0.009 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.weight
+ | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.1.norm2.bias
+ | -0.000 | -0.080 | 0.078 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.077 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.1.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.084 | 0.075 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.000 | -0.034 | 0.031 | 0.013 | torch.Size([120]) || stage4.residual_group1.blocks.1.mlp.fc2.bias
+ | 1.002 | 0.996 | 1.008 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.weight
+ | -0.000 | -0.003 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm1.bias
+ | 0.001 | -0.070 | 0.071 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.2.attn.position_bias
+ | 0.000 | -0.091 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_self.weight
+ | -0.000 | -0.007 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_self.bias
+ | 0.000 | -0.080 | 0.084 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.attn.proj.weight
+ | -0.000 | -0.023 | 0.026 | 0.010 | torch.Size([120]) || stage4.residual_group1.blocks.2.attn.proj.bias
+ | -0.000 | -0.107 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.2.attn.qkv_mut.weight
+ | 0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.weight
+ | 0.000 | -0.000 | 0.001 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.2.norm2.bias
+ | 0.000 | -0.076 | 0.077 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.000 | -0.005 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc11.bias
+ | -0.000 | -2.000 | 0.081 | 0.023 | torch.Size([240, 120]) || stage4.residual_group1.blocks.2.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.2.mlp.fc12.bias
+ | -0.000 | -0.084 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.2.mlp.fc2.weight
+ | 0.000 | -0.027 | 0.024 | 0.010 | torch.Size([120]) || stage4.residual_group1.blocks.2.mlp.fc2.bias
+ | 1.002 | 0.999 | 1.012 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.weight
+ | -0.000 | -0.003 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm1.bias
+ | 0.000 | -0.064 | 0.071 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.099 | 0.088 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_self.weight
+ | 0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_self.bias
+ | -0.000 | -0.083 | 0.084 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.attn.proj.weight
+ | -0.000 | -0.019 | 0.018 | 0.008 | torch.Size([120]) || stage4.residual_group1.blocks.3.attn.proj.bias
+ | 0.000 | -0.079 | 0.084 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.3.attn.qkv_mut.weight
+ | -0.000 | -0.004 | 0.004 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.weight
+ | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.3.norm2.bias
+ | -0.000 | -0.078 | 0.081 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc11.bias
+ | -0.000 | -0.087 | 0.076 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.3.mlp.fc12.weight
+ | -0.000 | -0.001 | 0.002 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.3.mlp.fc12.bias
+ | -0.000 | -0.079 | 0.082 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.000 | -0.022 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group1.blocks.3.mlp.fc2.bias
+ | 1.002 | 0.998 | 1.011 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.weight
+ | -0.001 | -0.004 | 0.003 | 0.001 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm1.bias
+ | 0.000 | -0.089 | 0.081 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.080 | 0.085 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.000 | -0.006 | 0.005 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_self.bias
+ | -0.000 | -0.075 | 0.077 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.attn.proj.weight
+ | -0.000 | -0.021 | 0.016 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.4.attn.proj.bias
+ | 0.000 | -0.082 | 0.088 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.4.attn.qkv_mut.weight
+ | -0.000 | -0.004 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.weight
+ | 0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.4.norm2.bias
+ | -0.000 | -0.086 | 0.080 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc11.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc11.bias
+ | 0.000 | -0.084 | 0.083 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.4.mlp.fc12.bias
+ | 0.000 | -0.076 | 0.081 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.000 | -0.018 | 0.015 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.4.mlp.fc2.bias
+ | 1.003 | 0.997 | 1.014 | 0.003 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.weight
+ | -0.001 | -0.005 | 0.004 | 0.002 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm1.bias
+ | -0.001 | -0.070 | 0.069 | 0.020 | torch.Size([675, 6]) || stage4.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage4.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage4.residual_group1.blocks.5.attn.position_bias
+ | -0.000 | -0.097 | 0.082 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_self.weight
+ | 0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_self.bias
+ | -0.000 | -0.075 | 0.089 | 0.021 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.attn.proj.weight
+ | 0.000 | -0.016 | 0.015 | 0.007 | torch.Size([120]) || stage4.residual_group1.blocks.5.attn.proj.bias
+ | 0.000 | -0.083 | 0.091 | 0.020 | torch.Size([360, 120]) || stage4.residual_group1.blocks.5.attn.qkv_mut.weight
+ | 0.000 | -0.006 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 1.000 | 0.999 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group1.blocks.5.norm2.bias
+ | 0.000 | -0.093 | 0.083 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc11.weight
+ | 0.000 | -0.002 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc11.bias
+ | 0.000 | -0.086 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group1.blocks.5.mlp.fc12.bias
+ | 0.000 | -0.079 | 0.092 | 0.020 | torch.Size([120, 240]) || stage4.residual_group1.blocks.5.mlp.fc2.weight
+ | -0.000 | -0.012 | 0.016 | 0.005 | torch.Size([120]) || stage4.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.090 | 0.111 | 0.024 | torch.Size([120, 120]) || stage4.linear1.weight
+ | 0.001 | -0.019 | 0.029 | 0.009 | torch.Size([120]) || stage4.linear1.bias
+ | 1.000 | 0.999 | 1.003 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm1.bias
+ | -0.000 | -0.078 | 0.075 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.0.attn.relative_position_index
+ | 0.000 | -0.084 | 0.087 | 0.020 | torch.Size([360, 120]) || stage4.residual_group2.blocks.0.attn.qkv_self.weight
+ | 0.000 | -0.005 | 0.004 | 0.001 | torch.Size([360]) || stage4.residual_group2.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.079 | 0.080 | 0.020 | torch.Size([120, 120]) || stage4.residual_group2.blocks.0.attn.proj.weight
+ | 0.000 | -0.021 | 0.024 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.0.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.0.norm2.bias
+ | -0.000 | -0.079 | 0.072 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.077 | 0.078 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.0.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.102 | 0.078 | 0.020 | torch.Size([120, 240]) || stage4.residual_group2.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.024 | 0.020 | 0.009 | torch.Size([120]) || stage4.residual_group2.blocks.0.mlp.fc2.bias
+ | 1.001 | 0.998 | 1.003 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.weight
+ | -0.000 | -0.002 | 0.002 | 0.001 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm1.bias
+ | -0.000 | -0.071 | 0.079 | 0.020 | torch.Size([2475, 6]) || stage4.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage4.residual_group2.blocks.1.attn.relative_position_index
+ | 0.000 | -0.078 | 0.096 | 0.020 | torch.Size([360, 120]) || stage4.residual_group2.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.005 | 0.006 | 0.001 | torch.Size([360]) || stage4.residual_group2.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.077 | 0.080 | 0.020 | torch.Size([120, 120]) || stage4.residual_group2.blocks.1.attn.proj.weight
+ | 0.000 | -0.020 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.1.attn.proj.bias
+ | 1.000 | 1.000 | 1.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.weight
+ | -0.000 | -0.000 | 0.000 | 0.000 | torch.Size([120]) || stage4.residual_group2.blocks.1.norm2.bias
+ | -0.000 | -0.085 | 0.082 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.000 | -0.001 | 0.001 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.083 | 0.085 | 0.020 | torch.Size([240, 120]) || stage4.residual_group2.blocks.1.mlp.fc12.weight
+ | 0.000 | -0.001 | 0.000 | 0.000 | torch.Size([240]) || stage4.residual_group2.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.078 | 0.078 | 0.020 | torch.Size([120, 240]) || stage4.residual_group2.blocks.1.mlp.fc2.weight
+ | 0.000 | -0.022 | 0.021 | 0.008 | torch.Size([120]) || stage4.residual_group2.blocks.1.mlp.fc2.bias
+ | 0.000 | -0.092 | 0.112 | 0.023 | torch.Size([120, 120]) || stage4.linear2.weight
+ | 0.000 | -0.032 | 0.049 | 0.015 | torch.Size([120]) || stage4.linear2.bias
+ | 0.000 | -0.036 | 0.037 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.weight
+ | 0.000 | -0.005 | 0.005 | 0.002 | torch.Size([120]) || stage4.pa_deform.bias
+ | -0.000 | -0.021 | 0.022 | 0.012 | torch.Size([120, 242, 3, 3]) || stage4.pa_deform.conv_offset.0.weight
+ | -0.001 | -0.021 | 0.021 | 0.012 | torch.Size([120]) || stage4.pa_deform.conv_offset.0.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.2.weight
+ | 0.002 | -0.030 | 0.030 | 0.018 | torch.Size([120]) || stage4.pa_deform.conv_offset.2.bias
+ | 0.000 | -0.030 | 0.030 | 0.018 | torch.Size([120, 120, 3, 3]) || stage4.pa_deform.conv_offset.4.weight
+ | -0.002 | -0.030 | 0.030 | 0.017 | torch.Size([120]) || stage4.pa_deform.conv_offset.4.bias
+ | 0.000 | -0.003 | 0.002 | 0.000 | torch.Size([324, 120, 3, 3]) || stage4.pa_deform.conv_offset.6.weight
+ | 0.000 | -0.005 | 0.004 | 0.001 | torch.Size([324]) || stage4.pa_deform.conv_offset.6.bias
+ | 0.000 | -0.172 | 0.177 | 0.022 | torch.Size([360, 360]) || stage4.pa_fuse.fc11.weight
+ | 0.002 | -0.027 | 0.088 | 0.014 | torch.Size([360]) || stage4.pa_fuse.fc11.bias
+ | 0.000 | -0.212 | 0.163 | 0.022 | torch.Size([360, 360]) || stage4.pa_fuse.fc12.weight
+ | 0.000 | -0.066 | 0.081 | 0.014 | torch.Size([360]) || stage4.pa_fuse.fc12.bias
+ | 0.000 | -0.413 | 0.387 | 0.029 | torch.Size([120, 360]) || stage4.pa_fuse.fc2.weight
+ | -0.001 | -0.198 | 0.214 | 0.073 | torch.Size([120]) || stage4.pa_fuse.fc2.bias
+ | 0.979 | 0.896 | 1.076 | 0.053 | torch.Size([30]) || stage5.reshape.1.weight
+ | -0.005 | -0.074 | 0.100 | 0.043 | torch.Size([30]) || stage5.reshape.1.bias
+ | 0.000 | -0.240 | 0.249 | 0.058 | torch.Size([120, 30]) || stage5.reshape.2.weight
+ | -0.002 | -0.286 | 0.229 | 0.080 | torch.Size([120]) || stage5.reshape.2.bias
+ | 1.001 | 0.993 | 1.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.weight
+ | -0.004 | -0.018 | 0.006 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm1.bias
+ | -0.000 | -0.066 | 0.062 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.0.attn.position_bias
+ | -0.000 | -0.091 | 0.086 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.000 | -0.014 | 0.012 | 0.004 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.166 | 0.172 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.attn.proj.weight
+ | -0.001 | -0.053 | 0.045 | 0.018 | torch.Size([120]) || stage5.residual_group1.blocks.0.attn.proj.bias
+ | -0.000 | -0.090 | 0.081 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.0.attn.qkv_mut.weight
+ | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.999 | 0.987 | 1.001 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.weight
+ | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.0.norm2.bias
+ | 0.000 | -0.094 | 0.079 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc11.weight
+ | 0.000 | -0.022 | 0.012 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.082 | 0.083 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.0.mlp.fc12.weight
+ | 0.000 | -0.013 | 0.014 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.075 | 0.083 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.073 | 0.078 | 0.021 | torch.Size([120]) || stage5.residual_group1.blocks.0.mlp.fc2.bias
+ | 1.001 | 0.994 | 1.007 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.weight
+ | -0.004 | -0.016 | 0.004 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm1.bias
+ | 0.000 | -0.065 | 0.063 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.1.attn.position_bias
+ | -0.000 | -0.077 | 0.083 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.022 | 0.017 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.113 | 0.098 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.attn.proj.weight
+ | 0.000 | -0.058 | 0.045 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.080 | 0.080 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.1.attn.qkv_mut.weight
+ | -0.000 | -0.008 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 0.999 | 0.982 | 1.001 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.weight
+ | 0.000 | -0.006 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.1.norm2.bias
+ | -0.000 | -0.076 | 0.083 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc11.weight
+ | 0.000 | -0.017 | 0.014 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.080 | 0.086 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.1.mlp.fc12.weight
+ | -0.000 | -0.014 | 0.016 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.096 | 0.079 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.001 | -0.051 | 0.039 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.1.mlp.fc2.bias
+ | 1.002 | 0.998 | 1.009 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.weight
+ | -0.004 | -0.014 | 0.003 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm1.bias
+ | 0.000 | -0.067 | 0.073 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.2.attn.position_bias
+ | 0.000 | -0.085 | 0.087 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_self.weight
+ | 0.000 | -0.015 | 0.014 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_self.bias
+ | -0.000 | -0.108 | 0.095 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.attn.proj.weight
+ | -0.001 | -0.043 | 0.039 | 0.013 | torch.Size([120]) || stage5.residual_group1.blocks.2.attn.proj.bias
+ | -0.000 | -0.088 | 0.081 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.2.attn.qkv_mut.weight
+ | -0.000 | -0.009 | 0.007 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 0.999 | 0.978 | 1.001 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.weight
+ | 0.000 | -0.003 | 0.004 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.2.norm2.bias
+ | -0.000 | -0.076 | 0.081 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.000 | -0.012 | 0.019 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc11.bias
+ | 0.000 | -0.079 | 0.077 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.2.mlp.fc12.weight
+ | -0.001 | -0.014 | 0.012 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.2.mlp.fc12.bias
+ | 0.000 | -0.076 | 0.082 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.2.mlp.fc2.weight
+ | -0.000 | -0.047 | 0.043 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.2.mlp.fc2.bias
+ | 1.002 | 0.978 | 1.015 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.weight
+ | -0.004 | -0.013 | 0.004 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm1.bias
+ | -0.000 | -0.084 | 0.070 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.078 | 0.082 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_self.weight
+ | -0.000 | -0.014 | 0.014 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_self.bias
+ | -0.000 | -0.123 | 0.132 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.attn.proj.weight
+ | 0.001 | -0.028 | 0.044 | 0.015 | torch.Size([120]) || stage5.residual_group1.blocks.3.attn.proj.bias
+ | -0.000 | -0.082 | 0.089 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.3.attn.qkv_mut.weight
+ | -0.000 | -0.007 | 0.008 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 0.999 | 0.974 | 1.001 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.weight
+ | 0.000 | -0.008 | 0.010 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.3.norm2.bias
+ | 0.000 | -0.075 | 0.088 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc11.weight
+ | 0.000 | -0.014 | 0.019 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc11.bias
+ | -0.000 | -0.081 | 0.080 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.3.mlp.fc12.weight
+ | 0.000 | -0.031 | 0.020 | 0.006 | torch.Size([240]) || stage5.residual_group1.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.081 | 0.106 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.3.mlp.fc2.weight
+ | -0.002 | -0.046 | 0.042 | 0.017 | torch.Size([120]) || stage5.residual_group1.blocks.3.mlp.fc2.bias
+ | 1.003 | 0.944 | 1.017 | 0.009 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.weight
+ | -0.005 | -0.015 | 0.004 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm1.bias
+ | -0.000 | -0.071 | 0.067 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.085 | 0.090 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.000 | -0.021 | 0.013 | 0.004 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_self.bias
+ | 0.000 | -0.130 | 0.089 | 0.021 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.attn.proj.weight
+ | -0.001 | -0.036 | 0.024 | 0.011 | torch.Size([120]) || stage5.residual_group1.blocks.4.attn.proj.bias
+ | 0.000 | -0.086 | 0.076 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.4.attn.qkv_mut.weight
+ | 0.000 | -0.008 | 0.008 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.999 | 0.967 | 1.001 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.weight
+ | 0.000 | -0.006 | 0.007 | 0.003 | torch.Size([120]) || stage5.residual_group1.blocks.4.norm2.bias
+ | 0.000 | -0.080 | 0.085 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc11.weight
+ | -0.001 | -0.015 | 0.010 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc11.bias
+ | -0.000 | -0.081 | 0.077 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.4.mlp.fc12.weight
+ | -0.000 | -0.020 | 0.018 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.4.mlp.fc12.bias
+ | 0.000 | -0.081 | 0.085 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.4.mlp.fc2.weight
+ | -0.001 | -0.037 | 0.050 | 0.014 | torch.Size([120]) || stage5.residual_group1.blocks.4.mlp.fc2.bias
+ | 1.004 | 0.976 | 1.039 | 0.008 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.weight
+ | -0.005 | -0.015 | 0.005 | 0.004 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm1.bias
+ | -0.000 | -0.070 | 0.076 | 0.020 | torch.Size([675, 6]) || stage5.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage5.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage5.residual_group1.blocks.5.attn.position_bias
+ | 0.000 | -0.099 | 0.097 | 0.021 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_self.weight
+ | -0.000 | -0.011 | 0.012 | 0.003 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_self.bias
+ | -0.000 | -0.084 | 0.093 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.attn.proj.weight
+ | 0.000 | -0.038 | 0.035 | 0.012 | torch.Size([120]) || stage5.residual_group1.blocks.5.attn.proj.bias
+ | 0.000 | -0.087 | 0.082 | 0.020 | torch.Size([360, 120]) || stage5.residual_group1.blocks.5.attn.qkv_mut.weight
+ | 0.000 | -0.008 | 0.010 | 0.002 | torch.Size([360]) || stage5.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.998 | 0.960 | 1.002 | 0.005 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.weight
+ | 0.000 | -0.006 | 0.006 | 0.002 | torch.Size([120]) || stage5.residual_group1.blocks.5.norm2.bias
+ | -0.000 | -0.088 | 0.095 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc11.weight
+ | -0.000 | -0.014 | 0.027 | 0.005 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc11.bias
+ | -0.000 | -0.081 | 0.074 | 0.020 | torch.Size([240, 120]) || stage5.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.000 | -0.013 | 0.025 | 0.004 | torch.Size([240]) || stage5.residual_group1.blocks.5.mlp.fc12.bias
+ | -0.000 | -0.100 | 0.086 | 0.020 | torch.Size([120, 240]) || stage5.residual_group1.blocks.5.mlp.fc2.weight
+ | 0.000 | -0.022 | 0.030 | 0.011 | torch.Size([120]) || stage5.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.102 | 0.117 | 0.023 | torch.Size([120, 120]) || stage5.linear1.weight
+ | -0.003 | -0.297 | 0.242 | 0.084 | torch.Size([120]) || stage5.linear1.bias
+ | 0.999 | 0.971 | 1.008 | 0.005 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.weight
+ | -0.000 | -0.035 | 0.034 | 0.011 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm1.bias
+ | 0.000 | -0.079 | 0.074 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.0.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.0.attn.relative_position_index
+ | -0.000 | -0.087 | 0.083 | 0.020 | torch.Size([360, 120]) || stage5.residual_group2.blocks.0.attn.qkv_self.weight
+ | -0.000 | -0.028 | 0.018 | 0.005 | torch.Size([360]) || stage5.residual_group2.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.079 | 0.082 | 0.021 | torch.Size([120, 120]) || stage5.residual_group2.blocks.0.attn.proj.weight
+ | -0.001 | -0.146 | 0.171 | 0.054 | torch.Size([120]) || stage5.residual_group2.blocks.0.attn.proj.bias
+ | 0.997 | 0.967 | 1.003 | 0.006 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.weight
+ | 0.000 | -0.005 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group2.blocks.0.norm2.bias
+ | -0.000 | -0.073 | 0.089 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc11.weight
+ | -0.002 | -0.017 | 0.008 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc11.bias
+ | -0.000 | -0.084 | 0.073 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.0.mlp.fc12.weight
+ | 0.000 | -0.013 | 0.011 | 0.003 | torch.Size([240]) || stage5.residual_group2.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.083 | 0.085 | 0.020 | torch.Size([120, 240]) || stage5.residual_group2.blocks.0.mlp.fc2.weight
+ | 0.000 | -0.103 | 0.140 | 0.037 | torch.Size([120]) || stage5.residual_group2.blocks.0.mlp.fc2.bias
+ | 0.999 | 0.986 | 1.010 | 0.004 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.weight
+ | 0.000 | -0.035 | 0.034 | 0.010 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm1.bias
+ | 0.000 | -0.087 | 0.074 | 0.020 | torch.Size([2475, 6]) || stage5.residual_group2.blocks.1.attn.relative_position_bias_table
+ | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage5.residual_group2.blocks.1.attn.relative_position_index
+ | -0.000 | -0.084 | 0.079 | 0.020 | torch.Size([360, 120]) || stage5.residual_group2.blocks.1.attn.qkv_self.weight
+ | 0.000 | -0.024 | 0.024 | 0.005 | torch.Size([360]) || stage5.residual_group2.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.077 | 0.078 | 0.021 | torch.Size([120, 120]) || stage5.residual_group2.blocks.1.attn.proj.weight
+ | -0.001 | -0.112 | 0.144 | 0.038 | torch.Size([120]) || stage5.residual_group2.blocks.1.attn.proj.bias
+ | 0.998 | 0.965 | 1.004 | 0.006 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.weight
+ | 0.000 | -0.004 | 0.005 | 0.002 | torch.Size([120]) || stage5.residual_group2.blocks.1.norm2.bias
+ | 0.000 | -0.088 | 0.079 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc11.weight
+ | -0.001 | -0.012 | 0.015 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.102 | 0.080 | 0.020 | torch.Size([240, 120]) || stage5.residual_group2.blocks.1.mlp.fc12.weight
+ | 0.000 | -0.012 | 0.009 | 0.004 | torch.Size([240]) || stage5.residual_group2.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.075 | 0.078 | 0.020 | torch.Size([120, 240]) || stage5.residual_group2.blocks.1.mlp.fc2.weight
+ | 0.000 | -0.105 | 0.131 | 0.042 | torch.Size([120]) || stage5.residual_group2.blocks.1.mlp.fc2.bias
+ | -0.000 | -0.220 | 0.209 | 0.035 | torch.Size([120, 120]) || stage5.linear2.weight
+ | -0.003 | -0.335 | 0.284 | 0.096 | torch.Size([120]) || stage5.linear2.bias
+ | -0.000 | -0.064 | 0.065 | 0.019 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.weight
+ | 0.001 | -0.050 | 0.050 | 0.029 | torch.Size([120]) || stage5.pa_deform.bias
+ | 0.000 | -0.119 | 0.106 | 0.013 | torch.Size([120, 242, 3, 3]) || stage5.pa_deform.conv_offset.0.weight
+ | -0.006 | -0.030 | 0.026 | 0.014 | torch.Size([120]) || stage5.pa_deform.conv_offset.0.bias
+ | -0.001 | -0.055 | 0.050 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.2.weight
+ | 0.001 | -0.033 | 0.031 | 0.018 | torch.Size([120]) || stage5.pa_deform.conv_offset.2.bias
+ | 0.001 | -0.060 | 0.050 | 0.018 | torch.Size([120, 120, 3, 3]) || stage5.pa_deform.conv_offset.4.weight
+ | -0.005 | -0.040 | 0.037 | 0.019 | torch.Size([120]) || stage5.pa_deform.conv_offset.4.bias
+ | 0.001 | -0.038 | 0.051 | 0.006 | torch.Size([324, 120, 3, 3]) || stage5.pa_deform.conv_offset.6.weight
+ | 0.000 | -0.048 | 0.050 | 0.017 | torch.Size([324]) || stage5.pa_deform.conv_offset.6.bias
+ | 0.000 | -0.334 | 0.340 | 0.036 | torch.Size([360, 360]) || stage5.pa_fuse.fc11.weight
+ | 0.037 | -0.050 | 0.294 | 0.064 | torch.Size([360]) || stage5.pa_fuse.fc11.bias
+ | -0.000 | -0.343 | 0.349 | 0.036 | torch.Size([360, 360]) || stage5.pa_fuse.fc12.weight
+ | -0.001 | -0.237 | 0.244 | 0.049 | torch.Size([360]) || stage5.pa_fuse.fc12.bias
+ | -0.000 | -0.575 | 0.591 | 0.060 | torch.Size([120, 360]) || stage5.pa_fuse.fc2.weight
+ | -0.001 | -0.404 | 0.344 | 0.122 | torch.Size([120]) || stage5.pa_fuse.fc2.bias
+ | 1.254 | 1.058 | 1.466 | 0.126 | torch.Size([30]) || stage6.reshape.1.weight
+ | -0.001 | -0.074 | 0.093 | 0.041 | torch.Size([30]) || stage6.reshape.1.bias
+ | 0.000 | -0.734 | 0.625 | 0.177 | torch.Size([120, 30]) || stage6.reshape.2.weight
+ | 0.003 | -0.269 | 0.341 | 0.108 | torch.Size([120]) || stage6.reshape.2.bias
+ | 0.815 | 0.495 | 1.118 | 0.121 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.weight
+ | -0.071 | -0.291 | 0.263 | 0.101 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm1.bias
+ | -0.000 | -0.080 | 0.087 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.0.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.0.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.0.attn.position_bias
+ | 0.000 | -0.136 | 0.134 | 0.026 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_self.weight
+ | -0.000 | -0.061 | 0.037 | 0.014 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_self.bias
+ | -0.000 | -0.201 | 0.182 | 0.032 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.attn.proj.weight
+ | 0.000 | -0.223 | 0.189 | 0.090 | torch.Size([120]) || stage6.residual_group1.blocks.0.attn.proj.bias
+ | 0.000 | -0.184 | 0.211 | 0.029 | torch.Size([360, 120]) || stage6.residual_group1.blocks.0.attn.qkv_mut.weight
+ | 0.000 | -0.049 | 0.069 | 0.011 | torch.Size([360]) || stage6.residual_group1.blocks.0.attn.qkv_mut.bias
+ | 0.710 | 0.556 | 0.893 | 0.072 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.weight
+ | -0.003 | -0.172 | 0.193 | 0.070 | torch.Size([120]) || stage6.residual_group1.blocks.0.norm2.bias
+ | 0.000 | -0.217 | 0.211 | 0.033 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc11.weight
+ | -0.041 | -0.158 | 0.025 | 0.036 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.209 | 0.178 | 0.031 | torch.Size([240, 120]) || stage6.residual_group1.blocks.0.mlp.fc12.weight
+ | -0.000 | -0.141 | 0.186 | 0.031 | torch.Size([240]) || stage6.residual_group1.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.245 | 0.347 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.0.mlp.fc2.weight
+ | 0.005 | -0.161 | 0.188 | 0.079 | torch.Size([120]) || stage6.residual_group1.blocks.0.mlp.fc2.bias
+ | 0.780 | 0.582 | 0.963 | 0.088 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.weight
+ | -0.112 | -0.302 | 0.103 | 0.085 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm1.bias
+ | 0.000 | -0.101 | 0.072 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.1.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.1.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.1.attn.position_bias
+ | 0.000 | -0.112 | 0.178 | 0.026 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_self.weight
+ | -0.000 | -0.034 | 0.049 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_self.bias
+ | 0.000 | -0.223 | 0.242 | 0.033 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.attn.proj.weight
+ | -0.003 | -0.149 | 0.105 | 0.047 | torch.Size([120]) || stage6.residual_group1.blocks.1.attn.proj.bias
+ | 0.000 | -0.199 | 0.173 | 0.031 | torch.Size([360, 120]) || stage6.residual_group1.blocks.1.attn.qkv_mut.weight
+ | 0.000 | -0.035 | 0.056 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.1.attn.qkv_mut.bias
+ | 0.744 | 0.530 | 0.917 | 0.066 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.weight
+ | 0.004 | -0.131 | 0.180 | 0.059 | torch.Size([120]) || stage6.residual_group1.blocks.1.norm2.bias
+ | 0.000 | -0.243 | 0.294 | 0.036 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc11.weight
+ | -0.039 | -0.217 | 0.045 | 0.037 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.206 | 0.178 | 0.033 | torch.Size([240, 120]) || stage6.residual_group1.blocks.1.mlp.fc12.weight
+ | -0.000 | -0.129 | 0.125 | 0.028 | torch.Size([240]) || stage6.residual_group1.blocks.1.mlp.fc12.bias
+ | -0.000 | -0.236 | 0.276 | 0.040 | torch.Size([120, 240]) || stage6.residual_group1.blocks.1.mlp.fc2.weight
+ | 0.000 | -0.158 | 0.170 | 0.063 | torch.Size([120]) || stage6.residual_group1.blocks.1.mlp.fc2.bias
+ | 0.829 | 0.586 | 1.007 | 0.078 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.weight
+ | -0.101 | -0.353 | 0.132 | 0.092 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm1.bias
+ | -0.000 | -0.082 | 0.076 | 0.021 | torch.Size([675, 6]) || stage6.residual_group1.blocks.2.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.2.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.2.attn.position_bias
+ | 0.000 | -0.154 | 0.143 | 0.032 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_self.weight
+ | 0.000 | -0.041 | 0.038 | 0.012 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_self.bias
+ | 0.000 | -0.187 | 0.202 | 0.035 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.attn.proj.weight
+ | 0.002 | -0.096 | 0.127 | 0.041 | torch.Size([120]) || stage6.residual_group1.blocks.2.attn.proj.bias
+ | -0.000 | -0.203 | 0.185 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.2.attn.qkv_mut.weight
+ | -0.000 | -0.045 | 0.049 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.2.attn.qkv_mut.bias
+ | 0.768 | 0.491 | 0.904 | 0.069 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.weight
+ | 0.001 | -0.146 | 0.159 | 0.062 | torch.Size([120]) || stage6.residual_group1.blocks.2.norm2.bias
+ | -0.000 | -0.184 | 0.204 | 0.037 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc11.weight
+ | -0.043 | -0.185 | 0.020 | 0.035 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc11.bias
+ | -0.000 | -0.188 | 0.270 | 0.035 | torch.Size([240, 120]) || stage6.residual_group1.blocks.2.mlp.fc12.weight
+ | 0.000 | -0.152 | 0.134 | 0.031 | torch.Size([240]) || stage6.residual_group1.blocks.2.mlp.fc12.bias
+ | -0.000 | -0.222 | 0.217 | 0.042 | torch.Size([120, 240]) || stage6.residual_group1.blocks.2.mlp.fc2.weight
+ | 0.002 | -0.141 | 0.144 | 0.058 | torch.Size([120]) || stage6.residual_group1.blocks.2.mlp.fc2.bias
+ | 0.820 | 0.554 | 0.976 | 0.065 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.weight
+ | -0.091 | -0.336 | 0.137 | 0.087 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm1.bias
+ | 0.000 | -0.124 | 0.222 | 0.023 | torch.Size([675, 6]) || stage6.residual_group1.blocks.3.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.3.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.3.attn.position_bias
+ | 0.000 | -0.157 | 0.175 | 0.036 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_self.weight
+ | -0.001 | -0.049 | 0.049 | 0.014 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_self.bias
+ | 0.000 | -0.238 | 0.236 | 0.036 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.attn.proj.weight
+ | -0.003 | -0.077 | 0.074 | 0.031 | torch.Size([120]) || stage6.residual_group1.blocks.3.attn.proj.bias
+ | 0.000 | -0.212 | 0.265 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.3.attn.qkv_mut.weight
+ | 0.000 | -0.028 | 0.052 | 0.009 | torch.Size([360]) || stage6.residual_group1.blocks.3.attn.qkv_mut.bias
+ | 0.768 | 0.530 | 0.903 | 0.080 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.weight
+ | 0.002 | -0.104 | 0.157 | 0.044 | torch.Size([120]) || stage6.residual_group1.blocks.3.norm2.bias
+ | -0.000 | -0.197 | 0.220 | 0.039 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc11.weight
+ | -0.042 | -0.155 | 0.043 | 0.039 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc11.bias
+ | 0.000 | -0.166 | 0.199 | 0.036 | torch.Size([240, 120]) || stage6.residual_group1.blocks.3.mlp.fc12.weight
+ | 0.001 | -0.102 | 0.138 | 0.040 | torch.Size([240]) || stage6.residual_group1.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.241 | 0.256 | 0.044 | torch.Size([120, 240]) || stage6.residual_group1.blocks.3.mlp.fc2.weight
+ | 0.003 | -0.123 | 0.115 | 0.046 | torch.Size([120]) || stage6.residual_group1.blocks.3.mlp.fc2.bias
+ | 0.817 | 0.631 | 0.918 | 0.055 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.weight
+ | -0.082 | -0.295 | 0.141 | 0.074 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm1.bias
+ | -0.000 | -0.084 | 0.205 | 0.024 | torch.Size([675, 6]) || stage6.residual_group1.blocks.4.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.4.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.4.attn.position_bias
+ | -0.000 | -0.174 | 0.199 | 0.040 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_self.weight
+ | -0.000 | -0.060 | 0.081 | 0.017 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_self.bias
+ | -0.000 | -0.194 | 0.191 | 0.037 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.attn.proj.weight
+ | 0.001 | -0.083 | 0.077 | 0.035 | torch.Size([120]) || stage6.residual_group1.blocks.4.attn.proj.bias
+ | -0.000 | -0.218 | 0.243 | 0.033 | torch.Size([360, 120]) || stage6.residual_group1.blocks.4.attn.qkv_mut.weight
+ | -0.000 | -0.031 | 0.024 | 0.007 | torch.Size([360]) || stage6.residual_group1.blocks.4.attn.qkv_mut.bias
+ | 0.744 | 0.478 | 0.913 | 0.082 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.weight
+ | -0.003 | -0.146 | 0.110 | 0.053 | torch.Size([120]) || stage6.residual_group1.blocks.4.norm2.bias
+ | -0.000 | -0.223 | 0.238 | 0.042 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc11.weight
+ | -0.046 | -0.200 | 0.071 | 0.051 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc11.bias
+ | -0.000 | -0.168 | 0.201 | 0.039 | torch.Size([240, 120]) || stage6.residual_group1.blocks.4.mlp.fc12.weight
+ | 0.002 | -0.128 | 0.141 | 0.053 | torch.Size([240]) || stage6.residual_group1.blocks.4.mlp.fc12.bias
+ | -0.000 | -0.220 | 0.205 | 0.047 | torch.Size([120, 240]) || stage6.residual_group1.blocks.4.mlp.fc2.weight
+ | 0.001 | -0.086 | 0.094 | 0.034 | torch.Size([120]) || stage6.residual_group1.blocks.4.mlp.fc2.bias
+ | 0.754 | 0.353 | 0.933 | 0.056 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.weight
+ | -0.058 | -0.246 | 0.105 | 0.060 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm1.bias
+ | -0.000 | -0.113 | 0.536 | 0.030 | torch.Size([675, 6]) || stage6.residual_group1.blocks.5.attn.relative_position_bias_table
+ | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage6.residual_group1.blocks.5.attn.relative_position_index
+ | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage6.residual_group1.blocks.5.attn.position_bias
+ | 0.000 | -0.261 | 0.224 | 0.044 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_self.weight
+ | 0.002 | -0.050 | 0.067 | 0.018 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_self.bias
+ | 0.000 | -0.234 | 0.256 | 0.038 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.attn.proj.weight
+ | 0.002 | -0.079 | 0.076 | 0.036 | torch.Size([120]) || stage6.residual_group1.blocks.5.attn.proj.bias
+ | -0.000 | -0.211 | 0.231 | 0.029 | torch.Size([360, 120]) || stage6.residual_group1.blocks.5.attn.qkv_mut.weight
+ | 0.000 | -0.033 | 0.030 | 0.008 | torch.Size([360]) || stage6.residual_group1.blocks.5.attn.qkv_mut.bias
+ | 0.677 | 0.275 | 0.833 | 0.083 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.weight
+ | 0.001 | -0.224 | 0.306 | 0.102 | torch.Size([120]) || stage6.residual_group1.blocks.5.norm2.bias
+ | -0.000 | -0.196 | 0.211 | 0.045 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc11.weight
+ | -0.061 | -0.289 | 0.136 | 0.089 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc11.bias
+ | -0.000 | -0.271 | 0.312 | 0.048 | torch.Size([240, 120]) || stage6.residual_group1.blocks.5.mlp.fc12.weight
+ | 0.003 | -0.166 | 0.155 | 0.075 | torch.Size([240]) || stage6.residual_group1.blocks.5.mlp.fc12.bias
+ | 0.000 | -0.286 | 0.375 | 0.054 | torch.Size([120, 240]) || stage6.residual_group1.blocks.5.mlp.fc2.weight
+ | 0.005 | -0.054 | 0.137 | 0.031 | torch.Size([120]) || stage6.residual_group1.blocks.5.mlp.fc2.bias
+ | -0.000 | -0.174 | 0.172 | 0.039 | torch.Size([120, 120]) || stage6.linear1.weight
+ | 0.002 | -0.275 | 0.348 | 0.113 | torch.Size([120]) || stage6.linear1.bias
+ | 0.704 | 0.402 | 1.002 | 0.132 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.weight
torch.Size([120]) || stage6.residual_group2.blocks.0.norm1.bias + | -0.000 | -0.172 | 0.570 | 0.025 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.0.attn.relative_position_index + | 0.000 | -0.337 | 0.378 | 0.041 | torch.Size([360, 120]) || stage6.residual_group2.blocks.0.attn.qkv_self.weight + | -0.000 | -0.071 | 0.068 | 0.019 | torch.Size([360]) || stage6.residual_group2.blocks.0.attn.qkv_self.bias + | 0.001 | -0.290 | 0.321 | 0.055 | torch.Size([120, 120]) || stage6.residual_group2.blocks.0.attn.proj.weight + | 0.001 | -0.255 | 0.250 | 0.104 | torch.Size([120]) || stage6.residual_group2.blocks.0.attn.proj.bias + | 0.695 | 0.353 | 0.966 | 0.098 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.weight + | -0.001 | -0.218 | 0.165 | 0.080 | torch.Size([120]) || stage6.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.259 | 0.255 | 0.039 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc11.weight + | -0.044 | -0.256 | 0.042 | 0.047 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.234 | 0.214 | 0.035 | torch.Size([240, 120]) || stage6.residual_group2.blocks.0.mlp.fc12.weight + | 0.002 | -0.133 | 0.091 | 0.027 | torch.Size([240]) || stage6.residual_group2.blocks.0.mlp.fc12.bias + | 0.000 | -0.333 | 0.296 | 0.042 | torch.Size([120, 240]) || stage6.residual_group2.blocks.0.mlp.fc2.weight + | 0.003 | -0.238 | 0.280 | 0.092 | torch.Size([120]) || stage6.residual_group2.blocks.0.mlp.fc2.bias + | 0.671 | 0.425 | 0.980 | 0.094 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.weight + | 0.001 | -0.261 | 0.305 | 0.119 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm1.bias + | -0.000 | -0.372 | 0.942 | 0.031 | torch.Size([2475, 6]) || stage6.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage6.residual_group2.blocks.1.attn.relative_position_index + | 0.000 | -0.450 | 0.494 | 0.045 | torch.Size([360, 120]) || stage6.residual_group2.blocks.1.attn.qkv_self.weight + | 0.000 | -0.133 | 0.119 | 0.029 | torch.Size([360]) || stage6.residual_group2.blocks.1.attn.qkv_self.bias + | 0.000 | -0.239 | 0.288 | 0.046 | torch.Size([120, 120]) || stage6.residual_group2.blocks.1.attn.proj.weight + | -0.001 | -0.187 | 0.157 | 0.064 | torch.Size([120]) || stage6.residual_group2.blocks.1.attn.proj.bias + | 0.687 | 0.160 | 0.907 | 0.128 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.weight + | -0.002 | -0.192 | 0.222 | 0.084 | torch.Size([120]) || stage6.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.257 | 0.426 | 0.042 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc11.weight + | -0.064 | -0.207 | 0.036 | 0.048 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.269 | 0.224 | 0.038 | torch.Size([240, 120]) || stage6.residual_group2.blocks.1.mlp.fc12.weight + | -0.000 | -0.126 | 0.129 | 0.030 | torch.Size([240]) || stage6.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.308 | 0.298 | 0.041 | torch.Size([120, 240]) || stage6.residual_group2.blocks.1.mlp.fc2.weight + | -0.004 | -0.180 | 0.192 | 0.061 | torch.Size([120]) || stage6.residual_group2.blocks.1.mlp.fc2.bias + | -0.000 | -0.297 | 0.368 | 0.069 | torch.Size([120, 120]) || stage6.linear2.weight + | 0.001 | -0.431 | 0.480 | 0.189 | torch.Size([120]) || stage6.linear2.bias + | 0.000 | 
-0.100 | 0.104 | 0.023 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.weight + | 0.001 | -0.018 | 0.029 | 0.010 | torch.Size([120]) || stage6.pa_deform.bias + | 0.000 | -0.105 | 0.111 | 0.015 | torch.Size([120, 242, 3, 3]) || stage6.pa_deform.conv_offset.0.weight + | -0.007 | -0.033 | 0.024 | 0.014 | torch.Size([120]) || stage6.pa_deform.conv_offset.0.bias + | -0.001 | -0.071 | 0.067 | 0.019 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.2.weight + | -0.003 | -0.061 | 0.043 | 0.022 | torch.Size([120]) || stage6.pa_deform.conv_offset.2.bias + | -0.000 | -0.074 | 0.068 | 0.019 | torch.Size([120, 120, 3, 3]) || stage6.pa_deform.conv_offset.4.weight + | 0.001 | -0.075 | 0.056 | 0.030 | torch.Size([120]) || stage6.pa_deform.conv_offset.4.bias + | 0.001 | -0.124 | 0.108 | 0.013 | torch.Size([324, 120, 3, 3]) || stage6.pa_deform.conv_offset.6.weight + | -0.001 | -0.113 | 0.076 | 0.021 | torch.Size([324]) || stage6.pa_deform.conv_offset.6.bias + | -0.001 | -0.517 | 0.524 | 0.101 | torch.Size([360, 360]) || stage6.pa_fuse.fc11.weight + | 0.154 | -0.305 | 0.679 | 0.180 | torch.Size([360]) || stage6.pa_fuse.fc11.bias + | 0.000 | -0.680 | 0.728 | 0.103 | torch.Size([360, 360]) || stage6.pa_fuse.fc12.weight + | 0.020 | -0.514 | 0.417 | 0.199 | torch.Size([360]) || stage6.pa_fuse.fc12.bias + | -0.000 | -0.587 | 0.737 | 0.135 | torch.Size([120, 360]) || stage6.pa_fuse.fc2.weight + | 0.015 | -0.437 | 0.490 | 0.230 | torch.Size([120]) || stage6.pa_fuse.fc2.bias + | 1.284 | 1.119 | 1.404 | 0.055 | torch.Size([30]) || stage7.reshape.1.weight + | -0.014 | -0.286 | 0.184 | 0.122 | torch.Size([30]) || stage7.reshape.1.bias + | -0.000 | -0.521 | 0.576 | 0.154 | torch.Size([120, 30]) || stage7.reshape.2.weight + | 0.004 | -0.387 | 0.738 | 0.175 | torch.Size([120]) || stage7.reshape.2.bias + | 0.440 | 0.099 | 0.775 | 0.141 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.weight + | -0.177 | -0.670 | 0.319 | 0.183 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm1.bias + | -0.055 | -2.159 | 1.979 | 0.240 | torch.Size([675, 6]) || stage7.residual_group1.blocks.0.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.0.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.0.attn.position_bias + | -0.000 | -0.535 | 0.554 | 0.104 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_self.weight + | 0.003 | -0.193 | 0.281 | 0.053 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_self.bias + | -0.001 | -0.397 | 0.395 | 0.075 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.attn.proj.weight + | -0.001 | -0.232 | 0.692 | 0.106 | torch.Size([120]) || stage7.residual_group1.blocks.0.attn.proj.bias + | -0.000 | -0.899 | 1.073 | 0.091 | torch.Size([360, 120]) || stage7.residual_group1.blocks.0.attn.qkv_mut.weight + | -0.000 | -0.122 | 0.104 | 0.017 | torch.Size([360]) || stage7.residual_group1.blocks.0.attn.qkv_mut.bias + | 0.310 | 0.157 | 0.440 | 0.055 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.weight + | 0.006 | -0.474 | 0.266 | 0.105 | torch.Size([120]) || stage7.residual_group1.blocks.0.norm2.bias + | -0.000 | -0.605 | 0.490 | 0.115 | torch.Size([240, 120]) || stage7.residual_group1.blocks.0.mlp.fc11.weight + | -0.101 | -0.310 | 0.126 | 0.070 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc11.bias + | -0.000 | -0.448 | 0.475 | 0.116 | torch.Size([240, 120]) || 
stage7.residual_group1.blocks.0.mlp.fc12.weight + | 0.006 | -0.185 | 0.215 | 0.071 | torch.Size([240]) || stage7.residual_group1.blocks.0.mlp.fc12.bias + | 0.001 | -0.465 | 0.512 | 0.122 | torch.Size([120, 240]) || stage7.residual_group1.blocks.0.mlp.fc2.weight + | 0.000 | -0.150 | 0.417 | 0.077 | torch.Size([120]) || stage7.residual_group1.blocks.0.mlp.fc2.bias + | 0.577 | 0.165 | 0.829 | 0.105 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.weight + | -0.136 | -0.849 | 0.206 | 0.141 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm1.bias + | -0.143 | -3.020 | 4.621 | 0.357 | torch.Size([675, 6]) || stage7.residual_group1.blocks.1.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.1.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.1.attn.position_bias + | 0.000 | -0.647 | 0.640 | 0.123 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_self.weight + | -0.002 | -0.356 | 0.382 | 0.064 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_self.bias + | 0.000 | -0.457 | 0.378 | 0.081 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.attn.proj.weight + | 0.000 | -0.250 | 0.707 | 0.108 | torch.Size([120]) || stage7.residual_group1.blocks.1.attn.proj.bias + | -0.001 | -1.055 | 1.091 | 0.096 | torch.Size([360, 120]) || stage7.residual_group1.blocks.1.attn.qkv_mut.weight + | -0.001 | -0.093 | 0.123 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.1.attn.qkv_mut.bias + | 0.411 | 0.265 | 0.535 | 0.044 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.weight + | 0.008 | -0.630 | 0.264 | 0.121 | torch.Size([120]) || stage7.residual_group1.blocks.1.norm2.bias + | 0.000 | -0.501 | 0.506 | 0.119 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc11.weight + | -0.087 | -0.341 | 0.140 | 0.073 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc11.bias + | -0.000 | -0.450 | 0.527 | 0.119 | torch.Size([240, 120]) || stage7.residual_group1.blocks.1.mlp.fc12.weight + | 0.005 | -0.188 | 0.171 | 0.063 | torch.Size([240]) || stage7.residual_group1.blocks.1.mlp.fc12.bias + | -0.000 | -0.554 | 0.546 | 0.121 | torch.Size([120, 240]) || stage7.residual_group1.blocks.1.mlp.fc2.weight + | -0.000 | -0.135 | 0.220 | 0.061 | torch.Size([120]) || stage7.residual_group1.blocks.1.mlp.fc2.bias + | 0.655 | 0.134 | 0.896 | 0.130 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.weight + | -0.139 | -0.788 | 0.181 | 0.115 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm1.bias + | -0.062 | -3.469 | 3.276 | 0.272 | torch.Size([675, 6]) || stage7.residual_group1.blocks.2.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.2.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.2.attn.position_bias + | -0.000 | -0.592 | 0.650 | 0.124 | torch.Size([360, 120]) || stage7.residual_group1.blocks.2.attn.qkv_self.weight + | -0.000 | -0.308 | 0.218 | 0.062 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_self.bias + | -0.000 | -0.355 | 0.345 | 0.082 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.attn.proj.weight + | 0.002 | -0.213 | 0.700 | 0.097 | torch.Size([120]) || stage7.residual_group1.blocks.2.attn.proj.bias + | -0.001 | -1.166 | 0.942 | 0.107 | torch.Size([360, 120]) || 
stage7.residual_group1.blocks.2.attn.qkv_mut.weight + | 0.000 | -0.106 | 0.093 | 0.018 | torch.Size([360]) || stage7.residual_group1.blocks.2.attn.qkv_mut.bias + | 0.466 | 0.317 | 0.565 | 0.042 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.weight + | 0.014 | -0.657 | 0.280 | 0.118 | torch.Size([120]) || stage7.residual_group1.blocks.2.norm2.bias + | 0.000 | -0.541 | 0.494 | 0.118 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc11.weight + | -0.079 | -0.335 | 0.122 | 0.080 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc11.bias + | -0.000 | -0.513 | 0.493 | 0.123 | torch.Size([240, 120]) || stage7.residual_group1.blocks.2.mlp.fc12.weight + | -0.007 | -0.180 | 0.175 | 0.066 | torch.Size([240]) || stage7.residual_group1.blocks.2.mlp.fc12.bias + | -0.001 | -0.509 | 0.479 | 0.123 | torch.Size([120, 240]) || stage7.residual_group1.blocks.2.mlp.fc2.weight + | 0.004 | -0.093 | 0.293 | 0.054 | torch.Size([120]) || stage7.residual_group1.blocks.2.mlp.fc2.bias + | 0.693 | 0.147 | 0.945 | 0.133 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.weight + | -0.132 | -0.906 | 0.249 | 0.113 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm1.bias + | -0.108 | -3.576 | 4.241 | 0.344 | torch.Size([675, 6]) || stage7.residual_group1.blocks.3.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.3.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.3.attn.position_bias + | -0.000 | -0.945 | 1.095 | 0.129 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_self.weight + | 0.003 | -0.274 | 0.204 | 0.061 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_self.bias + | -0.001 | -0.379 | 0.351 | 0.081 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.attn.proj.weight + | 0.000 | -0.211 | 0.587 | 0.095 | torch.Size([120]) || stage7.residual_group1.blocks.3.attn.proj.bias + | -0.000 | -1.269 | 1.067 | 0.102 | torch.Size([360, 120]) || stage7.residual_group1.blocks.3.attn.qkv_mut.weight + | 0.001 | -0.091 | 0.117 | 0.021 | torch.Size([360]) || stage7.residual_group1.blocks.3.attn.qkv_mut.bias + | 0.499 | 0.285 | 0.570 | 0.040 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.weight + | 0.012 | -0.567 | 0.273 | 0.104 | torch.Size([120]) || stage7.residual_group1.blocks.3.norm2.bias + | 0.001 | -0.528 | 0.499 | 0.118 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc11.weight + | -0.084 | -0.349 | 0.141 | 0.078 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc11.bias + | -0.000 | -0.547 | 0.592 | 0.126 | torch.Size([240, 120]) || stage7.residual_group1.blocks.3.mlp.fc12.weight + | 0.002 | -0.154 | 0.176 | 0.068 | torch.Size([240]) || stage7.residual_group1.blocks.3.mlp.fc12.bias + | 0.001 | -0.520 | 0.480 | 0.125 | torch.Size([120, 240]) || stage7.residual_group1.blocks.3.mlp.fc2.weight + | 0.001 | -0.150 | 0.207 | 0.065 | torch.Size([120]) || stage7.residual_group1.blocks.3.mlp.fc2.bias + | 0.726 | 0.137 | 1.004 | 0.160 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.weight + | -0.122 | -0.907 | 0.180 | 0.103 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm1.bias + | -0.078 | -3.824 | 4.241 | 0.297 | torch.Size([675, 6]) || stage7.residual_group1.blocks.4.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || 
stage7.residual_group1.blocks.4.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.4.attn.position_bias + | -0.000 | -1.188 | 0.796 | 0.127 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_self.weight + | 0.002 | -0.248 | 0.207 | 0.056 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_self.bias + | -0.001 | -0.409 | 0.369 | 0.085 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.attn.proj.weight + | 0.002 | -0.224 | 0.322 | 0.094 | torch.Size([120]) || stage7.residual_group1.blocks.4.attn.proj.bias + | 0.000 | -1.744 | 1.273 | 0.110 | torch.Size([360, 120]) || stage7.residual_group1.blocks.4.attn.qkv_mut.weight + | 0.001 | -0.092 | 0.113 | 0.019 | torch.Size([360]) || stage7.residual_group1.blocks.4.attn.qkv_mut.bias + | 0.514 | 0.277 | 0.614 | 0.041 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.weight + | 0.016 | -0.621 | 0.286 | 0.095 | torch.Size([120]) || stage7.residual_group1.blocks.4.norm2.bias + | 0.001 | -0.517 | 0.453 | 0.116 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc11.weight + | -0.064 | -0.260 | 0.143 | 0.083 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc11.bias + | 0.000 | -0.503 | 0.554 | 0.129 | torch.Size([240, 120]) || stage7.residual_group1.blocks.4.mlp.fc12.weight + | -0.004 | -0.232 | 0.193 | 0.075 | torch.Size([240]) || stage7.residual_group1.blocks.4.mlp.fc12.bias + | -0.001 | -0.595 | 0.543 | 0.128 | torch.Size([120, 240]) || stage7.residual_group1.blocks.4.mlp.fc2.weight + | 0.001 | -0.196 | 0.198 | 0.071 | torch.Size([120]) || stage7.residual_group1.blocks.4.mlp.fc2.bias + | 0.731 | 0.152 | 1.075 | 0.114 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.weight + | -0.076 | -1.003 | 0.176 | 0.107 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm1.bias + | -0.121 | -3.281 | 4.671 | 0.296 | torch.Size([675, 6]) || stage7.residual_group1.blocks.5.attn.relative_position_bias_table + | 337.000 | 0.000 | 674.000 | 166.395 | torch.Size([128, 128]) || stage7.residual_group1.blocks.5.attn.relative_position_index + | 0.487 | -1.000 | 1.000 | 0.512 | torch.Size([1, 64, 120]) || stage7.residual_group1.blocks.5.attn.position_bias + | -0.000 | -0.640 | 1.083 | 0.122 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_self.weight + | -0.001 | -0.239 | 0.314 | 0.068 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_self.bias + | 0.001 | -0.344 | 0.452 | 0.078 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.attn.proj.weight + | 0.004 | -0.361 | 0.251 | 0.093 | torch.Size([120]) || stage7.residual_group1.blocks.5.attn.proj.bias + | 0.000 | -0.637 | 0.806 | 0.093 | torch.Size([360, 120]) || stage7.residual_group1.blocks.5.attn.qkv_mut.weight + | -0.000 | -0.088 | 0.091 | 0.017 | torch.Size([360]) || stage7.residual_group1.blocks.5.attn.qkv_mut.bias + | 0.514 | 0.238 | 0.594 | 0.042 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.weight + | 0.017 | -0.650 | 0.162 | 0.089 | torch.Size([120]) || stage7.residual_group1.blocks.5.norm2.bias + | 0.000 | -0.442 | 0.479 | 0.114 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc11.weight + | -0.040 | -0.400 | 0.203 | 0.101 | torch.Size([240]) || stage7.residual_group1.blocks.5.mlp.fc11.bias + | -0.000 | -0.541 | 0.514 | 0.130 | torch.Size([240, 120]) || stage7.residual_group1.blocks.5.mlp.fc12.weight + | -0.008 | -0.319 | 0.309 | 0.092 | torch.Size([240]) || 
stage7.residual_group1.blocks.5.mlp.fc12.bias + | -0.000 | -1.018 | 1.398 | 0.130 | torch.Size([120, 240]) || stage7.residual_group1.blocks.5.mlp.fc2.weight + | 0.001 | -1.606 | 0.269 | 0.179 | torch.Size([120]) || stage7.residual_group1.blocks.5.mlp.fc2.bias + | 0.000 | -0.186 | 0.207 | 0.048 | torch.Size([120, 120]) || stage7.linear1.weight + | 0.010 | -0.448 | 0.437 | 0.161 | torch.Size([120]) || stage7.linear1.bias + | 0.703 | 0.381 | 0.856 | 0.084 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.weight + | 0.014 | -0.645 | 0.486 | 0.169 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm1.bias + | -0.007 | -4.468 | 1.008 | 0.164 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage7.residual_group2.blocks.0.attn.relative_position_index + | -0.000 | -0.625 | 0.834 | 0.120 | torch.Size([360, 120]) || stage7.residual_group2.blocks.0.attn.qkv_self.weight + | -0.009 | -0.737 | 0.632 | 0.135 | torch.Size([360]) || stage7.residual_group2.blocks.0.attn.qkv_self.bias + | -0.000 | -0.403 | 0.406 | 0.088 | torch.Size([120, 120]) || stage7.residual_group2.blocks.0.attn.proj.weight + | -0.007 | -0.338 | 0.165 | 0.070 | torch.Size([120]) || stage7.residual_group2.blocks.0.attn.proj.bias + | 0.435 | 0.323 | 0.526 | 0.038 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.weight + | 0.005 | -0.678 | 0.379 | 0.117 | torch.Size([120]) || stage7.residual_group2.blocks.0.norm2.bias + | 0.000 | -0.465 | 0.467 | 0.110 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc11.weight + | -0.031 | -0.236 | 0.180 | 0.077 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc11.bias + | -0.000 | -0.490 | 0.520 | 0.121 | torch.Size([240, 120]) || stage7.residual_group2.blocks.0.mlp.fc12.weight + | -0.003 | -0.197 | 0.242 | 0.069 | torch.Size([240]) || stage7.residual_group2.blocks.0.mlp.fc12.bias + | -0.000 | -0.525 | 0.501 | 0.122 | torch.Size([120, 240]) || stage7.residual_group2.blocks.0.mlp.fc2.weight + | -0.005 | -0.431 | 0.164 | 0.077 | torch.Size([120]) || stage7.residual_group2.blocks.0.mlp.fc2.bias + | 0.703 | 0.306 | 0.866 | 0.079 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.weight + | 0.009 | -0.647 | 0.481 | 0.149 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm1.bias + | -0.010 | -3.504 | 1.842 | 0.134 | torch.Size([2475, 6]) || stage7.residual_group2.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage7.residual_group2.blocks.1.attn.relative_position_index + | -0.000 | -0.639 | 0.590 | 0.122 | torch.Size([360, 120]) || stage7.residual_group2.blocks.1.attn.qkv_self.weight + | -0.001 | -0.613 | 0.609 | 0.148 | torch.Size([360]) || stage7.residual_group2.blocks.1.attn.qkv_self.bias + | 0.001 | -0.316 | 0.325 | 0.085 | torch.Size([120, 120]) || stage7.residual_group2.blocks.1.attn.proj.weight + | -0.004 | -0.350 | 0.145 | 0.069 | torch.Size([120]) || stage7.residual_group2.blocks.1.attn.proj.bias + | 0.452 | 0.309 | 0.558 | 0.037 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.weight + | 0.003 | -0.661 | 0.246 | 0.091 | torch.Size([120]) || stage7.residual_group2.blocks.1.norm2.bias + | 0.000 | -0.580 | 0.410 | 0.108 | torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc11.weight + | -0.020 | -0.258 | 0.299 | 0.104 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.561 | 0.126 | 
torch.Size([240, 120]) || stage7.residual_group2.blocks.1.mlp.fc12.weight + | -0.002 | -0.234 | 0.434 | 0.090 | torch.Size([240]) || stage7.residual_group2.blocks.1.mlp.fc12.bias + | -0.000 | -0.778 | 0.581 | 0.124 | torch.Size([120, 240]) || stage7.residual_group2.blocks.1.mlp.fc2.weight + | -0.001 | -0.888 | 0.286 | 0.135 | torch.Size([120]) || stage7.residual_group2.blocks.1.mlp.fc2.bias + | -0.001 | -0.348 | 0.237 | 0.060 | torch.Size([120, 120]) || stage7.linear2.weight + | 0.023 | -0.390 | 0.506 | 0.167 | torch.Size([120]) || stage7.linear2.bias + | -0.000 | -0.104 | 0.107 | 0.024 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.weight + | 0.002 | -0.041 | 0.035 | 0.016 | torch.Size([120]) || stage7.pa_deform.bias + | -0.000 | -0.123 | 0.109 | 0.017 | torch.Size([120, 242, 3, 3]) || stage7.pa_deform.conv_offset.0.weight + | -0.002 | -0.034 | 0.032 | 0.015 | torch.Size([120]) || stage7.pa_deform.conv_offset.0.bias + | -0.001 | -0.111 | 0.084 | 0.019 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.2.weight + | -0.008 | -0.073 | 0.081 | 0.034 | torch.Size([120]) || stage7.pa_deform.conv_offset.2.bias + | -0.002 | -0.154 | 0.122 | 0.018 | torch.Size([120, 120, 3, 3]) || stage7.pa_deform.conv_offset.4.weight + | 0.014 | -0.041 | 0.068 | 0.026 | torch.Size([120]) || stage7.pa_deform.conv_offset.4.bias + | -0.001 | -0.408 | 0.365 | 0.034 | torch.Size([324, 120, 3, 3]) || stage7.pa_deform.conv_offset.6.weight + | -0.003 | -0.057 | 0.054 | 0.024 | torch.Size([324]) || stage7.pa_deform.conv_offset.6.bias + | 0.000 | -0.697 | 0.606 | 0.123 | torch.Size([360, 360]) || stage7.pa_fuse.fc11.weight + | 0.119 | -0.211 | 0.720 | 0.177 | torch.Size([360]) || stage7.pa_fuse.fc11.bias + | 0.000 | -1.175 | 0.924 | 0.154 | torch.Size([360, 360]) || stage7.pa_fuse.fc12.weight + | -0.000 | -0.581 | 0.580 | 0.190 | torch.Size([360]) || stage7.pa_fuse.fc12.bias + | 0.001 | -0.786 | 0.874 | 0.135 | torch.Size([120, 360]) || stage7.pa_fuse.fc2.weight + | -0.053 | -0.522 | 0.577 | 0.205 | torch.Size([120]) || stage7.pa_fuse.fc2.bias + | 1.225 | 1.000 | 1.516 | 0.095 | torch.Size([120]) || stage8.0.1.weight + | -0.013 | -0.413 | 0.465 | 0.139 | torch.Size([120]) || stage8.0.1.bias + | 0.000 | -2.505 | 0.627 | 0.136 | torch.Size([180, 120]) || stage8.0.2.weight + | 0.005 | -0.397 | 0.377 | 0.107 | torch.Size([180]) || stage8.0.2.bias + | 0.456 | 0.123 | 0.760 | 0.129 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.weight + | -0.022 | -0.343 | 0.875 | 0.099 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm1.bias + | -0.014 | -1.907 | 2.592 | 0.130 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.632 | 0.628 | 0.099 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.0.attn.qkv_self.weight + | 0.006 | -0.567 | 0.668 | 0.148 | torch.Size([540]) || stage8.1.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.477 | 0.447 | 0.094 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.0.attn.proj.weight + | -0.010 | -0.460 | 0.225 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.0.attn.proj.bias + | 0.429 | 0.119 | 0.634 | 0.090 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.weight + | -0.007 | -0.338 | 0.803 | 0.086 | torch.Size([180]) || stage8.1.residual_group.blocks.0.norm2.bias + | -0.006 | -0.572 | 0.539 | 0.119 | torch.Size([360, 180]) || 
stage8.1.residual_group.blocks.0.mlp.fc11.weight + | -0.060 | -0.260 | 0.185 | 0.060 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.461 | 0.548 | 0.113 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.0.mlp.fc12.weight + | 0.000 | -0.163 | 0.183 | 0.050 | torch.Size([360]) || stage8.1.residual_group.blocks.0.mlp.fc12.bias + | -0.000 | -0.757 | 0.581 | 0.118 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.0.mlp.fc2.weight + | -0.003 | -0.191 | 0.121 | 0.057 | torch.Size([180]) || stage8.1.residual_group.blocks.0.mlp.fc2.bias + | 0.557 | 0.086 | 0.800 | 0.112 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.weight + | -0.029 | -0.230 | 0.878 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm1.bias + | -0.016 | -2.004 | 1.711 | 0.154 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.690 | 0.575 | 0.109 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.1.attn.qkv_self.weight + | 0.011 | -0.641 | 0.609 | 0.135 | torch.Size([540]) || stage8.1.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.466 | 0.401 | 0.094 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.1.attn.proj.weight + | -0.008 | -0.344 | 0.181 | 0.080 | torch.Size([180]) || stage8.1.residual_group.blocks.1.attn.proj.bias + | 0.503 | 0.226 | 0.742 | 0.093 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.weight + | -0.009 | -0.404 | 0.818 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.1.norm2.bias + | -0.007 | -0.595 | 0.532 | 0.121 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc11.weight + | -0.068 | -0.261 | 0.071 | 0.053 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.529 | 0.573 | 0.116 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.1.mlp.fc12.weight + | 0.002 | -0.129 | 0.197 | 0.046 | torch.Size([360]) || stage8.1.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.556 | 0.582 | 0.118 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.1.mlp.fc2.weight + | -0.003 | -0.170 | 0.145 | 0.052 | torch.Size([180]) || stage8.1.residual_group.blocks.1.mlp.fc2.bias + | 0.699 | 0.202 | 0.912 | 0.109 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.weight + | -0.033 | -0.253 | 0.924 | 0.091 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm1.bias + | -0.030 | -2.510 | 2.088 | 0.194 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.637 | 0.801 | 0.116 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.2.attn.qkv_self.weight + | 0.006 | -0.512 | 0.520 | 0.110 | torch.Size([540]) || stage8.1.residual_group.blocks.2.attn.qkv_self.bias + | 0.000 | -0.381 | 0.337 | 0.090 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.2.attn.proj.weight + | -0.011 | -0.238 | 0.234 | 0.085 | torch.Size([180]) || stage8.1.residual_group.blocks.2.attn.proj.bias + | 0.594 | 0.150 | 0.810 | 0.108 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.weight + | -0.010 | -0.483 | 0.726 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.2.norm2.bias + | -0.006 | -0.567 | 0.499 | 0.125 | torch.Size([360, 180]) || 
stage8.1.residual_group.blocks.2.mlp.fc11.weight + | -0.077 | -0.360 | 0.050 | 0.056 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.536 | 0.673 | 0.119 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.2.mlp.fc12.weight + | 0.001 | -0.142 | 0.186 | 0.043 | torch.Size([360]) || stage8.1.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.536 | 0.524 | 0.119 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.2.mlp.fc2.weight + | -0.006 | -0.147 | 0.133 | 0.051 | torch.Size([180]) || stage8.1.residual_group.blocks.2.mlp.fc2.bias + | 0.683 | 0.141 | 0.908 | 0.105 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.weight + | -0.033 | -0.199 | 0.878 | 0.088 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm1.bias + | -0.039 | -1.527 | 3.891 | 0.199 | torch.Size([2475, 6]) || stage8.1.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.1.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.682 | 0.693 | 0.120 | torch.Size([540, 180]) || stage8.1.residual_group.blocks.3.attn.qkv_self.weight + | 0.007 | -0.543 | 0.513 | 0.138 | torch.Size([540]) || stage8.1.residual_group.blocks.3.attn.qkv_self.bias + | -0.001 | -0.390 | 0.476 | 0.089 | torch.Size([180, 180]) || stage8.1.residual_group.blocks.3.attn.proj.weight + | -0.007 | -0.176 | 0.150 | 0.062 | torch.Size([180]) || stage8.1.residual_group.blocks.3.attn.proj.bias + | 0.640 | 0.094 | 0.853 | 0.120 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.weight + | -0.009 | -0.372 | 0.683 | 0.084 | torch.Size([180]) || stage8.1.residual_group.blocks.3.norm2.bias + | -0.006 | -0.628 | 0.521 | 0.126 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc11.weight + | -0.089 | -0.367 | 0.047 | 0.054 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.629 | 0.562 | 0.121 | torch.Size([360, 180]) || stage8.1.residual_group.blocks.3.mlp.fc12.weight + | -0.001 | -0.186 | 0.128 | 0.042 | torch.Size([360]) || stage8.1.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.485 | 0.499 | 0.118 | torch.Size([180, 360]) || stage8.1.residual_group.blocks.3.mlp.fc2.weight + | -0.007 | -0.138 | 0.209 | 0.050 | torch.Size([180]) || stage8.1.residual_group.blocks.3.mlp.fc2.bias + | 0.000 | -0.294 | 0.577 | 0.071 | torch.Size([180, 180]) || stage8.1.linear.weight + | 0.004 | -0.349 | 0.235 | 0.072 | torch.Size([180]) || stage8.1.linear.bias + | 0.708 | 0.242 | 1.026 | 0.136 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.weight + | -0.032 | -0.212 | 0.830 | 0.100 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm1.bias + | -0.039 | -1.954 | 2.394 | 0.212 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.0.attn.relative_position_index + | 0.000 | -0.922 | 0.646 | 0.116 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.429 | 0.524 | 0.101 | torch.Size([540]) || stage8.2.residual_group.blocks.0.attn.qkv_self.bias + | -0.000 | -0.467 | 0.453 | 0.109 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.0.attn.proj.weight + | -0.005 | -0.339 | 0.264 | 0.095 | torch.Size([180]) || stage8.2.residual_group.blocks.0.attn.proj.bias + | 0.587 | 0.255 | 0.837 | 0.086 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.weight + | 
-0.011 | -0.285 | 0.721 | 0.083 | torch.Size([180]) || stage8.2.residual_group.blocks.0.norm2.bias + | -0.006 | -0.586 | 0.534 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc11.weight + | -0.075 | -0.225 | 0.066 | 0.047 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc11.bias + | 0.000 | -0.493 | 0.532 | 0.123 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.0.mlp.fc12.weight + | 0.003 | -0.189 | 0.178 | 0.047 | torch.Size([360]) || stage8.2.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.551 | 0.543 | 0.124 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.0.mlp.fc2.weight + | -0.010 | -0.154 | 0.142 | 0.054 | torch.Size([180]) || stage8.2.residual_group.blocks.0.mlp.fc2.bias + | 0.773 | 0.210 | 1.004 | 0.113 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.weight + | -0.035 | -0.176 | 0.873 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm1.bias + | -0.027 | -2.407 | 1.736 | 0.214 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.817 | 0.977 | 0.123 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.1.attn.qkv_self.weight + | 0.001 | -0.659 | 0.461 | 0.115 | torch.Size([540]) || stage8.2.residual_group.blocks.1.attn.qkv_self.bias + | 0.000 | -0.484 | 0.453 | 0.109 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.1.attn.proj.weight + | -0.014 | -0.315 | 0.252 | 0.091 | torch.Size([180]) || stage8.2.residual_group.blocks.1.attn.proj.bias + | 0.641 | 0.337 | 0.810 | 0.081 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.weight + | -0.011 | -0.177 | 0.806 | 0.083 | torch.Size([180]) || stage8.2.residual_group.blocks.1.norm2.bias + | -0.006 | -0.569 | 0.598 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc11.weight + | -0.079 | -0.323 | 0.071 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.512 | 0.577 | 0.126 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.1.mlp.fc12.weight + | -0.003 | -0.142 | 0.161 | 0.050 | torch.Size([360]) || stage8.2.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.529 | 0.572 | 0.125 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.1.mlp.fc2.weight + | -0.010 | -0.178 | 0.159 | 0.066 | torch.Size([180]) || stage8.2.residual_group.blocks.1.mlp.fc2.bias + | 0.857 | 0.199 | 1.153 | 0.112 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.weight + | -0.039 | -0.189 | 0.943 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm1.bias + | -0.042 | -1.962 | 2.773 | 0.246 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.783 | 0.655 | 0.123 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.2.attn.qkv_self.weight + | 0.004 | -0.338 | 0.533 | 0.099 | torch.Size([540]) || stage8.2.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.497 | 0.461 | 0.107 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.2.attn.proj.weight + | -0.008 | -0.288 | 0.183 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.2.attn.proj.bias + | 0.681 | 0.327 | 0.878 | 0.085 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.weight + | 
-0.012 | -0.178 | 0.773 | 0.084 | torch.Size([180]) || stage8.2.residual_group.blocks.2.norm2.bias + | -0.006 | -0.789 | 0.546 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc11.weight + | -0.081 | -0.249 | 0.036 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc11.bias + | 0.000 | -0.526 | 0.555 | 0.128 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.2.mlp.fc12.weight + | 0.000 | -0.133 | 0.191 | 0.051 | torch.Size([360]) || stage8.2.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.572 | 0.529 | 0.126 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.164 | 0.147 | 0.065 | torch.Size([180]) || stage8.2.residual_group.blocks.2.mlp.fc2.bias + | 0.877 | 0.198 | 1.043 | 0.094 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.weight + | -0.038 | -0.210 | 0.916 | 0.091 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm1.bias + | -0.094 | -2.974 | 4.987 | 0.299 | torch.Size([2475, 6]) || stage8.2.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.2.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.964 | 1.011 | 0.126 | torch.Size([540, 180]) || stage8.2.residual_group.blocks.3.attn.qkv_self.weight + | -0.002 | -0.404 | 0.429 | 0.101 | torch.Size([540]) || stage8.2.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.501 | 0.489 | 0.110 | torch.Size([180, 180]) || stage8.2.residual_group.blocks.3.attn.proj.weight + | -0.021 | -0.305 | 0.208 | 0.097 | torch.Size([180]) || stage8.2.residual_group.blocks.3.attn.proj.bias + | 0.697 | 0.295 | 0.894 | 0.089 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.weight + | -0.015 | -0.241 | 0.712 | 0.086 | torch.Size([180]) || stage8.2.residual_group.blocks.3.norm2.bias + | -0.005 | -0.562 | 0.573 | 0.125 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc11.weight + | -0.085 | -0.302 | 0.080 | 0.060 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc11.bias + | -0.000 | -0.734 | 0.573 | 0.130 | torch.Size([360, 180]) || stage8.2.residual_group.blocks.3.mlp.fc12.weight + | 0.001 | -0.150 | 0.161 | 0.054 | torch.Size([360]) || stage8.2.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.671 | 0.623 | 0.127 | torch.Size([180, 360]) || stage8.2.residual_group.blocks.3.mlp.fc2.weight + | -0.023 | -0.252 | 0.317 | 0.081 | torch.Size([180]) || stage8.2.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.278 | 0.345 | 0.064 | torch.Size([180, 180]) || stage8.2.linear.weight + | 0.004 | -0.315 | 0.148 | 0.064 | torch.Size([180]) || stage8.2.linear.bias + | 0.850 | 0.326 | 1.087 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.weight + | -0.031 | -0.334 | 0.779 | 0.106 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm1.bias + | -0.012 | -2.917 | 1.476 | 0.175 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.603 | 0.666 | 0.124 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.0.attn.qkv_self.weight + | -0.001 | -0.374 | 0.381 | 0.086 | torch.Size([540]) || stage8.3.residual_group.blocks.0.attn.qkv_self.bias + | 0.000 | -0.577 | 0.605 | 0.119 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.0.attn.proj.weight + | -0.008 | -0.394 | 0.499 | 0.134 | 
torch.Size([180]) || stage8.3.residual_group.blocks.0.attn.proj.bias + | 0.636 | 0.321 | 0.790 | 0.073 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.weight + | -0.013 | -0.294 | 0.774 | 0.090 | torch.Size([180]) || stage8.3.residual_group.blocks.0.norm2.bias + | -0.004 | -0.540 | 0.539 | 0.123 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc11.weight + | -0.065 | -0.212 | 0.047 | 0.051 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.608 | 0.603 | 0.130 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.0.mlp.fc12.weight + | -0.002 | -0.177 | 0.155 | 0.051 | torch.Size([360]) || stage8.3.residual_group.blocks.0.mlp.fc12.bias + | 0.000 | -0.573 | 0.630 | 0.129 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.0.mlp.fc2.weight + | -0.005 | -0.189 | 0.178 | 0.071 | torch.Size([180]) || stage8.3.residual_group.blocks.0.mlp.fc2.bias + | 0.899 | 0.275 | 1.048 | 0.099 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.weight + | -0.031 | -0.223 | 0.771 | 0.088 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm1.bias + | -0.003 | -3.151 | 1.718 | 0.202 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.1.attn.relative_position_index + | -0.000 | -0.732 | 0.868 | 0.127 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.1.attn.qkv_self.weight + | 0.002 | -0.412 | 0.350 | 0.093 | torch.Size([540]) || stage8.3.residual_group.blocks.1.attn.qkv_self.bias + | 0.001 | -0.466 | 0.487 | 0.114 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.1.attn.proj.weight + | -0.006 | -0.388 | 0.400 | 0.129 | torch.Size([180]) || stage8.3.residual_group.blocks.1.attn.proj.bias + | 0.711 | 0.381 | 0.864 | 0.082 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.weight + | -0.009 | -0.240 | 0.692 | 0.090 | torch.Size([180]) || stage8.3.residual_group.blocks.1.norm2.bias + | -0.005 | -0.657 | 0.639 | 0.126 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc11.weight + | -0.077 | -0.263 | 0.047 | 0.057 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc11.bias + | -0.000 | -0.673 | 0.605 | 0.134 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.1.mlp.fc12.weight + | 0.002 | -0.158 | 0.155 | 0.046 | torch.Size([360]) || stage8.3.residual_group.blocks.1.mlp.fc12.bias + | -0.000 | -0.582 | 0.585 | 0.131 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.1.mlp.fc2.weight + | -0.009 | -0.253 | 0.178 | 0.070 | torch.Size([180]) || stage8.3.residual_group.blocks.1.mlp.fc2.bias + | 0.941 | 0.262 | 1.154 | 0.094 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.weight + | -0.032 | -0.162 | 0.906 | 0.084 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm1.bias + | -0.005 | -3.421 | 1.350 | 0.205 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.2.attn.relative_position_index + | -0.000 | -0.777 | 0.735 | 0.130 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.2.attn.qkv_self.weight + | 0.000 | -0.355 | 0.421 | 0.092 | torch.Size([540]) || stage8.3.residual_group.blocks.2.attn.qkv_self.bias + | 0.000 | -0.479 | 0.475 | 0.115 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.2.attn.proj.weight + | -0.013 | -0.292 | 0.345 | 0.122 | 
torch.Size([180]) || stage8.3.residual_group.blocks.2.attn.proj.bias + | 0.743 | 0.242 | 0.919 | 0.093 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.weight + | -0.011 | -0.214 | 0.691 | 0.094 | torch.Size([180]) || stage8.3.residual_group.blocks.2.norm2.bias + | -0.005 | -0.633 | 0.498 | 0.127 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc11.weight + | -0.082 | -0.346 | 0.087 | 0.062 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.591 | 0.670 | 0.134 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.2.mlp.fc12.weight + | 0.001 | -0.190 | 0.151 | 0.056 | torch.Size([360]) || stage8.3.residual_group.blocks.2.mlp.fc12.bias + | 0.000 | -0.560 | 0.637 | 0.132 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.2.mlp.fc2.weight + | -0.009 | -0.226 | 0.250 | 0.085 | torch.Size([180]) || stage8.3.residual_group.blocks.2.mlp.fc2.bias + | 0.950 | 0.250 | 1.103 | 0.086 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.weight + | -0.035 | -0.196 | 0.925 | 0.088 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm1.bias + | -0.026 | -3.591 | 5.653 | 0.236 | torch.Size([2475, 6]) || stage8.3.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.3.residual_group.blocks.3.attn.relative_position_index + | 0.000 | -0.753 | 0.637 | 0.128 | torch.Size([540, 180]) || stage8.3.residual_group.blocks.3.attn.qkv_self.weight + | 0.000 | -0.333 | 0.432 | 0.081 | torch.Size([540]) || stage8.3.residual_group.blocks.3.attn.qkv_self.bias + | 0.001 | -0.591 | 0.591 | 0.118 | torch.Size([180, 180]) || stage8.3.residual_group.blocks.3.attn.proj.weight + | -0.014 | -0.348 | 0.267 | 0.122 | torch.Size([180]) || stage8.3.residual_group.blocks.3.attn.proj.bias + | 0.735 | 0.254 | 0.893 | 0.082 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.weight + | -0.011 | -0.241 | 0.659 | 0.093 | torch.Size([180]) || stage8.3.residual_group.blocks.3.norm2.bias + | -0.005 | -0.628 | 0.667 | 0.125 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc11.weight + | -0.076 | -0.411 | 0.113 | 0.072 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.662 | 0.578 | 0.135 | torch.Size([360, 180]) || stage8.3.residual_group.blocks.3.mlp.fc12.weight + | -0.004 | -0.208 | 0.169 | 0.054 | torch.Size([360]) || stage8.3.residual_group.blocks.3.mlp.fc12.bias + | 0.000 | -0.602 | 0.588 | 0.131 | torch.Size([180, 360]) || stage8.3.residual_group.blocks.3.mlp.fc2.weight + | -0.011 | -0.218 | 0.232 | 0.096 | torch.Size([180]) || stage8.3.residual_group.blocks.3.mlp.fc2.bias + | -0.000 | -0.343 | 0.316 | 0.065 | torch.Size([180, 180]) || stage8.3.linear.weight + | 0.010 | -0.297 | 0.187 | 0.061 | torch.Size([180]) || stage8.3.linear.bias + | 1.012 | 0.330 | 1.282 | 0.149 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.weight + | -0.030 | -0.347 | 0.800 | 0.134 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm1.bias + | -0.013 | -2.816 | 3.792 | 0.236 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.0.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.0.attn.relative_position_index + | -0.000 | -0.807 | 0.825 | 0.131 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.0.attn.qkv_self.weight + | -0.003 | -0.429 | 0.319 | 0.083 | torch.Size([540]) || 
stage8.4.residual_group.blocks.0.attn.qkv_self.bias + | 0.001 | -0.553 | 0.569 | 0.136 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.0.attn.proj.weight + | -0.019 | -0.443 | 0.441 | 0.139 | torch.Size([180]) || stage8.4.residual_group.blocks.0.attn.proj.bias + | 0.638 | 0.420 | 0.797 | 0.063 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.weight + | -0.018 | -0.222 | 0.886 | 0.107 | torch.Size([180]) || stage8.4.residual_group.blocks.0.norm2.bias + | -0.002 | -0.576 | 0.510 | 0.117 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc11.weight + | -0.018 | -0.277 | 0.123 | 0.068 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc11.bias + | -0.000 | -0.687 | 0.625 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.0.mlp.fc12.weight + | -0.007 | -0.264 | 0.267 | 0.076 | torch.Size([360]) || stage8.4.residual_group.blocks.0.mlp.fc12.bias + | 0.001 | -0.639 | 0.705 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.0.mlp.fc2.weight + | -0.012 | -0.255 | 0.274 | 0.095 | torch.Size([180]) || stage8.4.residual_group.blocks.0.mlp.fc2.bias + | 1.092 | 0.475 | 1.341 | 0.115 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.weight + | -0.030 | -0.294 | 0.686 | 0.113 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm1.bias + | 0.018 | -3.165 | 0.990 | 0.213 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.1.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.1.attn.relative_position_index + | 0.000 | -0.695 | 0.699 | 0.133 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.1.attn.qkv_self.weight + | 0.002 | -0.319 | 0.286 | 0.075 | torch.Size([540]) || stage8.4.residual_group.blocks.1.attn.qkv_self.bias + | -0.001 | -0.542 | 0.519 | 0.133 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.1.attn.proj.weight + | -0.017 | -0.439 | 0.451 | 0.152 | torch.Size([180]) || stage8.4.residual_group.blocks.1.attn.proj.bias + | 0.664 | 0.366 | 0.835 | 0.074 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.weight + | -0.015 | -0.217 | 0.985 | 0.103 | torch.Size([180]) || stage8.4.residual_group.blocks.1.norm2.bias + | -0.002 | -0.641 | 0.563 | 0.117 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc11.weight + | -0.022 | -0.381 | 0.161 | 0.078 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc11.bias + | 0.000 | -0.571 | 0.642 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.1.mlp.fc12.weight + | 0.003 | -0.279 | 0.311 | 0.087 | torch.Size([360]) || stage8.4.residual_group.blocks.1.mlp.fc12.bias + | 0.000 | -0.738 | 0.633 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.1.mlp.fc2.weight + | -0.007 | -0.254 | 0.261 | 0.084 | torch.Size([180]) || stage8.4.residual_group.blocks.1.mlp.fc2.bias + | 1.125 | 0.525 | 1.405 | 0.117 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.weight + | -0.033 | -0.186 | 0.627 | 0.082 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm1.bias + | 0.028 | -3.477 | 0.957 | 0.217 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.2.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.2.attn.relative_position_index + | 0.000 | -0.663 | 0.658 | 0.130 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.2.attn.qkv_self.weight + | -0.007 | -0.357 | 0.255 | 0.064 | torch.Size([540]) || 
stage8.4.residual_group.blocks.2.attn.qkv_self.bias + | -0.000 | -0.596 | 0.578 | 0.137 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.2.attn.proj.weight + | -0.018 | -0.506 | 0.389 | 0.159 | torch.Size([180]) || stage8.4.residual_group.blocks.2.attn.proj.bias + | 0.694 | 0.319 | 0.865 | 0.084 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.weight + | -0.018 | -0.150 | 0.975 | 0.087 | torch.Size([180]) || stage8.4.residual_group.blocks.2.norm2.bias + | -0.002 | -0.619 | 0.565 | 0.116 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc11.weight + | -0.025 | -0.345 | 0.208 | 0.086 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc11.bias + | -0.000 | -0.624 | 0.607 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.2.mlp.fc12.weight + | -0.003 | -0.388 | 0.290 | 0.075 | torch.Size([360]) || stage8.4.residual_group.blocks.2.mlp.fc12.bias + | -0.000 | -0.927 | 0.675 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.2.mlp.fc2.weight + | -0.011 | -0.325 | 0.240 | 0.096 | torch.Size([180]) || stage8.4.residual_group.blocks.2.mlp.fc2.bias + | 1.108 | 0.535 | 1.297 | 0.094 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.weight + | -0.035 | -0.213 | 0.546 | 0.064 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm1.bias + | 0.020 | -3.042 | 1.420 | 0.192 | torch.Size([2475, 6]) || stage8.4.residual_group.blocks.3.attn.relative_position_bias_table + | 1237.000 | 0.000 | 2474.000 | 545.607 | torch.Size([384, 384]) || stage8.4.residual_group.blocks.3.attn.relative_position_index + | -0.000 | -0.697 | 0.700 | 0.128 | torch.Size([540, 180]) || stage8.4.residual_group.blocks.3.attn.qkv_self.weight + | -0.000 | -0.220 | 0.311 | 0.065 | torch.Size([540]) || stage8.4.residual_group.blocks.3.attn.qkv_self.bias + | 0.000 | -0.652 | 0.592 | 0.138 | torch.Size([180, 180]) || stage8.4.residual_group.blocks.3.attn.proj.weight + | -0.019 | -0.535 | 0.426 | 0.154 | torch.Size([180]) || stage8.4.residual_group.blocks.3.attn.proj.bias + | 0.685 | 0.225 | 0.893 | 0.082 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.weight + | -0.023 | -0.211 | 0.938 | 0.093 | torch.Size([180]) || stage8.4.residual_group.blocks.3.norm2.bias + | -0.001 | -0.501 | 0.564 | 0.113 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc11.weight + | -0.014 | -0.339 | 0.237 | 0.092 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc11.bias + | 0.000 | -0.560 | 0.626 | 0.132 | torch.Size([360, 180]) || stage8.4.residual_group.blocks.3.mlp.fc12.weight + | 0.000 | -0.231 | 0.239 | 0.075 | torch.Size([360]) || stage8.4.residual_group.blocks.3.mlp.fc12.bias + | -0.000 | -0.544 | 0.657 | 0.130 | torch.Size([180, 360]) || stage8.4.residual_group.blocks.3.mlp.fc2.weight + | -0.007 | -0.271 | 0.274 | 0.093 | torch.Size([180]) || stage8.4.residual_group.blocks.3.mlp.fc2.bias + | -0.001 | -0.473 | 0.481 | 0.069 | torch.Size([180, 180]) || stage8.4.linear.weight + | 0.029 | -0.333 | 0.194 | 0.076 | torch.Size([180]) || stage8.4.linear.bias + | 1.025 | 0.297 | 1.336 | 0.162 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.weight + | -0.034 | -0.429 | 0.872 | 0.141 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm1.bias + | -0.574 | -4.515 | 3.381 | 0.800 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.0.attn.relative_position_bias_table + | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.0.attn.relative_position_index + | 0.000 | 
-0.771 | 0.886 | 0.125 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.0.attn.qkv_self.weight
+ | 0.000 | -0.356 | 0.521 | 0.085 | torch.Size([540]) || stage8.5.residual_group.blocks.0.attn.qkv_self.bias
+ | -0.001 | -0.632 | 0.656 | 0.147 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.0.attn.proj.weight
+ | -0.029 | -0.329 | 0.697 | 0.127 | torch.Size([180]) || stage8.5.residual_group.blocks.0.attn.proj.bias
+ | 0.777 | 0.446 | 0.952 | 0.069 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.weight
+ | -0.022 | -0.335 | 0.920 | 0.121 | torch.Size([180]) || stage8.5.residual_group.blocks.0.norm2.bias
+ | -0.002 | -0.520 | 0.598 | 0.117 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc11.weight
+ | -0.013 | -0.456 | 0.200 | 0.075 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.677 | 0.642 | 0.137 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.0.mlp.fc12.weight
+ | 0.005 | -0.272 | 0.233 | 0.083 | torch.Size([360]) || stage8.5.residual_group.blocks.0.mlp.fc12.bias
+ | 0.000 | -0.762 | 0.598 | 0.136 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.0.mlp.fc2.weight
+ | -0.025 | -0.244 | 0.583 | 0.111 | torch.Size([180]) || stage8.5.residual_group.blocks.0.mlp.fc2.bias
+ | 1.021 | 0.261 | 1.261 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.weight
+ | -0.033 | -0.358 | 0.867 | 0.120 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm1.bias
+ | -0.550 | -3.274 | 4.406 | 0.670 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.1.attn.relative_position_bias_table
+ | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.1.attn.relative_position_index
+ | 0.000 | -0.819 | 0.986 | 0.122 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.1.attn.qkv_self.weight
+ | 0.005 | -0.510 | 0.446 | 0.084 | torch.Size([540]) || stage8.5.residual_group.blocks.1.attn.qkv_self.bias
+ | -0.003 | -0.739 | 0.682 | 0.151 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.1.attn.proj.weight
+ | -0.032 | -0.318 | 0.607 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.attn.proj.bias
+ | 0.823 | 0.420 | 0.950 | 0.070 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.weight
+ | -0.021 | -0.274 | 0.882 | 0.111 | torch.Size([180]) || stage8.5.residual_group.blocks.1.norm2.bias
+ | -0.002 | -0.496 | 0.532 | 0.117 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc11.weight
+ | -0.028 | -0.260 | 0.194 | 0.080 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc11.bias
+ | 0.000 | -0.620 | 0.586 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.1.mlp.fc12.weight
+ | 0.004 | -0.284 | 0.423 | 0.083 | torch.Size([360]) || stage8.5.residual_group.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.774 | 0.614 | 0.137 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.1.mlp.fc2.weight
+ | -0.028 | -0.371 | 0.561 | 0.133 | torch.Size([180]) || stage8.5.residual_group.blocks.1.mlp.fc2.bias
+ | 1.096 | 0.377 | 1.321 | 0.110 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.weight
+ | -0.033 | -0.244 | 0.755 | 0.100 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm1.bias
+ | -0.441 | -3.439 | 5.870 | 0.668 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.2.attn.relative_position_bias_table
+ | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.2.attn.relative_position_index
+ | -0.000 | -0.710 | 0.679 | 0.123 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.2.attn.qkv_self.weight
+ | 0.003 | -0.277 | 0.283 | 0.068 | torch.Size([540]) || stage8.5.residual_group.blocks.2.attn.qkv_self.bias
+ | 0.001 | -0.824 | 0.684 | 0.150 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.2.attn.proj.weight
+ | -0.033 | -0.390 | 0.545 | 0.155 | torch.Size([180]) || stage8.5.residual_group.blocks.2.attn.proj.bias
+ | 0.843 | 0.390 | 0.984 | 0.076 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.weight
+ | -0.022 | -0.211 | 0.854 | 0.090 | torch.Size([180]) || stage8.5.residual_group.blocks.2.norm2.bias
+ | -0.002 | -0.522 | 0.503 | 0.116 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc11.weight
+ | -0.024 | -0.243 | 0.219 | 0.091 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc11.bias
+ | -0.001 | -0.638 | 0.617 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.2.mlp.fc12.weight
+ | -0.004 | -0.268 | 0.380 | 0.078 | torch.Size([360]) || stage8.5.residual_group.blocks.2.mlp.fc12.bias
+ | 0.000 | -0.713 | 0.769 | 0.138 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.2.mlp.fc2.weight
+ | -0.034 | -0.372 | 0.592 | 0.151 | torch.Size([180]) || stage8.5.residual_group.blocks.2.mlp.fc2.bias
+ | 1.027 | 0.318 | 1.206 | 0.094 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.weight
+ | -0.033 | -0.187 | 0.768 | 0.088 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm1.bias
+ | -0.347 | -2.664 | 2.684 | 0.528 | torch.Size([225, 6]) || stage8.5.residual_group.blocks.3.attn.relative_position_bias_table
+ | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.5.residual_group.blocks.3.attn.relative_position_index
+ | 0.000 | -0.677 | 0.676 | 0.127 | torch.Size([540, 180]) || stage8.5.residual_group.blocks.3.attn.qkv_self.weight
+ | 0.002 | -0.410 | 0.354 | 0.080 | torch.Size([540]) || stage8.5.residual_group.blocks.3.attn.qkv_self.bias
+ | 0.000 | -0.630 | 0.725 | 0.145 | torch.Size([180, 180]) || stage8.5.residual_group.blocks.3.attn.proj.weight
+ | -0.041 | -0.385 | 0.660 | 0.163 | torch.Size([180]) || stage8.5.residual_group.blocks.3.attn.proj.bias
+ | 0.849 | 0.390 | 0.985 | 0.070 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.weight
+ | -0.023 | -0.163 | 0.810 | 0.084 | torch.Size([180]) || stage8.5.residual_group.blocks.3.norm2.bias
+ | -0.002 | -0.547 | 0.536 | 0.115 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc11.weight
+ | -0.012 | -0.366 | 0.252 | 0.106 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc11.bias
+ | -0.000 | -0.669 | 0.597 | 0.139 | torch.Size([360, 180]) || stage8.5.residual_group.blocks.3.mlp.fc12.weight
+ | -0.002 | -0.216 | 0.202 | 0.074 | torch.Size([360]) || stage8.5.residual_group.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.700 | 0.674 | 0.139 | torch.Size([180, 360]) || stage8.5.residual_group.blocks.3.mlp.fc2.weight
+ | -0.032 | -0.376 | 0.666 | 0.134 | torch.Size([180]) || stage8.5.residual_group.blocks.3.mlp.fc2.bias
+ | -0.001 | -0.299 | 0.469 | 0.069 | torch.Size([180, 180]) || stage8.5.linear.weight
+ | 0.081 | -0.562 | 0.263 | 0.109 | torch.Size([180]) || stage8.5.linear.bias
+ | 1.111 | 0.208 | 1.434 | 0.192 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.weight
+ | -0.048 | -0.547 | 0.851 | 0.175 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm1.bias
+ | -0.252 | -2.157 | 6.293 | 0.490 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.0.attn.relative_position_bias_table
+ | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.0.attn.relative_position_index
+ | 0.000 | -0.664 | 0.631 | 0.123 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.0.attn.qkv_self.weight
+ | 0.007 | -0.293 | 0.366 | 0.078 | torch.Size([540]) || stage8.6.residual_group.blocks.0.attn.qkv_self.bias
+ | 0.000 | -0.701 | 0.726 | 0.154 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.0.attn.proj.weight
+ | 0.030 | -0.318 | 0.331 | 0.109 | torch.Size([180]) || stage8.6.residual_group.blocks.0.attn.proj.bias
+ | 0.959 | 0.475 | 1.322 | 0.088 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.weight
+ | -0.039 | -0.421 | 0.873 | 0.151 | torch.Size([180]) || stage8.6.residual_group.blocks.0.norm2.bias
+ | -0.002 | -0.550 | 0.783 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc11.weight
+ | 0.002 | -0.269 | 0.152 | 0.069 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc11.bias
+ | 0.000 | -0.914 | 0.839 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.0.mlp.fc12.weight
+ | 0.001 | -0.340 | 0.304 | 0.075 | torch.Size([360]) || stage8.6.residual_group.blocks.0.mlp.fc12.bias
+ | -0.000 | -0.592 | 0.713 | 0.140 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.0.mlp.fc2.weight
+ | 0.002 | -0.535 | 0.384 | 0.177 | torch.Size([180]) || stage8.6.residual_group.blocks.0.mlp.fc2.bias
+ | 1.123 | 0.183 | 1.352 | 0.165 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.weight
+ | -0.047 | -0.513 | 0.903 | 0.168 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm1.bias
+ | -0.234 | -1.968 | 6.366 | 0.448 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.1.attn.relative_position_bias_table
+ | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.1.attn.relative_position_index
+ | 0.000 | -0.751 | 0.759 | 0.121 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.1.attn.qkv_self.weight
+ | -0.001 | -0.300 | 0.214 | 0.061 | torch.Size([540]) || stage8.6.residual_group.blocks.1.attn.qkv_self.bias
+ | -0.000 | -0.657 | 0.699 | 0.148 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.1.attn.proj.weight
+ | 0.031 | -0.321 | 0.293 | 0.115 | torch.Size([180]) || stage8.6.residual_group.blocks.1.attn.proj.bias
+ | 0.986 | 0.416 | 1.360 | 0.096 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.weight
+ | -0.038 | -0.393 | 0.807 | 0.146 | torch.Size([180]) || stage8.6.residual_group.blocks.1.norm2.bias
+ | -0.001 | -0.589 | 0.620 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc11.weight
+ | 0.005 | -0.316 | 0.229 | 0.071 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc11.bias
+ | -0.000 | -0.738 | 0.766 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.1.mlp.fc12.weight
+ | 0.001 | -0.252 | 0.302 | 0.072 | torch.Size([360]) || stage8.6.residual_group.blocks.1.mlp.fc12.bias
+ | 0.000 | -0.674 | 0.629 | 0.140 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.1.mlp.fc2.weight
+ | -0.001 | -0.475 | 0.441 | 0.175 | torch.Size([180]) || stage8.6.residual_group.blocks.1.mlp.fc2.bias
+ | 1.097 | 0.342 | 1.294 | 0.134 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.weight
+ | -0.054 | -0.639 | 0.904 | 0.186 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm1.bias
+ | -0.135 | -3.252 | 1.238 | 0.360 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.2.attn.relative_position_bias_table
+ | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.2.attn.relative_position_index
+ | 0.000 | -0.672 | 0.663 | 0.128 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.2.attn.qkv_self.weight
+ | 0.007 | -0.170 | 0.228 | 0.046 | torch.Size([540]) || stage8.6.residual_group.blocks.2.attn.qkv_self.bias
+ | -0.001 | -0.660 | 0.651 | 0.147 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.2.attn.proj.weight
+ | 0.031 | -0.360 | 0.322 | 0.126 | torch.Size([180]) || stage8.6.residual_group.blocks.2.attn.proj.bias
+ | 1.004 | 0.360 | 1.381 | 0.099 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.weight
+ | -0.042 | -0.447 | 0.808 | 0.157 | torch.Size([180]) || stage8.6.residual_group.blocks.2.norm2.bias
+ | -0.000 | -0.600 | 0.603 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc11.weight
+ | 0.022 | -0.447 | 0.249 | 0.086 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc11.bias
+ | 0.000 | -0.666 | 0.708 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.2.mlp.fc12.weight
+ | -0.002 | -0.326 | 0.272 | 0.075 | torch.Size([360]) || stage8.6.residual_group.blocks.2.mlp.fc12.bias
+ | -0.001 | -0.653 | 0.719 | 0.142 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.2.mlp.fc2.weight
+ | -0.011 | -0.488 | 0.321 | 0.153 | torch.Size([180]) || stage8.6.residual_group.blocks.2.mlp.fc2.bias
+ | 1.095 | 0.272 | 1.302 | 0.123 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.weight
+ | -0.052 | -0.557 | 1.069 | 0.192 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm1.bias
+ | -0.196 | -2.349 | 1.401 | 0.360 | torch.Size([225, 6]) || stage8.6.residual_group.blocks.3.attn.relative_position_bias_table
+ | 112.000 | 0.000 | 224.000 | 48.719 | torch.Size([64, 64]) || stage8.6.residual_group.blocks.3.attn.relative_position_index
+ | 0.000 | -0.741 | 0.657 | 0.124 | torch.Size([540, 180]) || stage8.6.residual_group.blocks.3.attn.qkv_self.weight
+ | 0.001 | -0.186 | 0.141 | 0.040 | torch.Size([540]) || stage8.6.residual_group.blocks.3.attn.qkv_self.bias
+ | -0.001 | -0.669 | 0.671 | 0.139 | torch.Size([180, 180]) || stage8.6.residual_group.blocks.3.attn.proj.weight
+ | -0.004 | -0.323 | 0.300 | 0.124 | torch.Size([180]) || stage8.6.residual_group.blocks.3.attn.proj.bias
+ | 0.999 | 0.383 | 1.380 | 0.103 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.weight
+ | -0.044 | -0.392 | 0.694 | 0.163 | torch.Size([180]) || stage8.6.residual_group.blocks.3.norm2.bias
+ | 0.000 | -0.577 | 0.857 | 0.116 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc11.weight
+ | 0.041 | -0.394 | 0.238 | 0.087 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc11.bias
+ | 0.000 | -0.924 | 0.828 | 0.143 | torch.Size([360, 180]) || stage8.6.residual_group.blocks.3.mlp.fc12.weight
+ | -0.003 | -0.214 | 0.407 | 0.071 | torch.Size([360]) || stage8.6.residual_group.blocks.3.mlp.fc12.bias
+ | 0.000 | -0.827 | 0.755 | 0.141 | torch.Size([180, 360]) || stage8.6.residual_group.blocks.3.mlp.fc2.weight
+ | 0.022 | -0.296 | 0.262 | 0.107 | torch.Size([180]) || stage8.6.residual_group.blocks.3.mlp.fc2.bias
+ | 0.002 | -1.059 | 1.262 | 0.089 | torch.Size([180, 180]) || stage8.6.linear.weight
+ | 0.031 | -0.789 | 0.427 | 0.120 | torch.Size([180]) || stage8.6.linear.bias
+ | 0.389 | 0.079 | 1.137 | 0.176 | torch.Size([180]) || norm.weight
+ | -0.021 | -0.669 | 0.888 | 0.127 | torch.Size([180]) || norm.bias
+ | 0.000 | -0.486 | 0.568 | 0.103 | torch.Size([120, 180]) || conv_after_body.weight
+ | -0.000 | -0.167 | 0.168 | 0.055 | torch.Size([120]) || conv_after_body.bias
+ | -0.000 | -1.782 | 1.300 | 0.109 | torch.Size([64, 120, 1, 3, 3]) || conv_before_upsample.0.weight
+ | -0.019 | -0.542 | 0.437 | 0.162 | torch.Size([64]) || conv_before_upsample.0.bias
+ | 0.001 | -1.915 | 1.372 | 0.090 | torch.Size([256, 64, 1, 3, 3]) || upsample.0.weight
+ | -0.045 | -0.281 | 0.215 | 0.097 | torch.Size([256]) || upsample.0.bias
+ | -0.006 | -4.826 | 0.582 | 0.075 | torch.Size([256, 64, 1, 3, 3]) || upsample.5.weight
+ | -0.154 | -0.441 | 0.187 | 0.100 | torch.Size([256]) || upsample.5.bias
+ | 0.000 | -0.210 | 0.246 | 0.012 | torch.Size([64, 64, 1, 3, 3]) || upsample.10.weight
+ | 0.000 | -0.013 | 0.007 | 0.003 | torch.Size([64]) || upsample.10.bias
+ | 0.000 | -0.044 | 0.042 | 0.004 | torch.Size([3, 64, 1, 3, 3]) || conv_last.weight
+ | 0.000 | 0.000 | 0.000 | 0.000 | torch.Size([3]) || conv_last.bias
+
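[Editor's note: the dump above lists, for every tensor in the network's state dict, its mean | min | max | std | shape || name. A minimal sketch that reproduces this layout for any PyTorch module is shown below; the function name and formatting are illustrative, not necessarily KAIR's exact logging code.]

```python
import torch

def describe_state_dict(net: torch.nn.Module) -> str:
    """Format per-tensor statistics in the same
    'mean | min | max | std | shape || name' layout as the dump above.
    Iterating over state_dict() also covers registered buffers such as
    relative_position_index, which appear in the log alongside weights."""
    lines = []
    for name, tensor in net.state_dict().items():
        v = tensor.float()  # buffers may be integer-typed; cast for stats
        lines.append(' | {:7.3f} | {:7.3f} | {:7.3f} | {:7.3f} | {} || {}'.format(
            v.mean().item(), v.min().item(), v.max().item(), v.std().item(),
            v.shape, name))
    return '\n'.join(lines)

# Usage (hypothetical): print(describe_state_dict(model)) for any nn.Module.
```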
+22-03-11 10:53:40.924 : task: 001_train_vrt_videosr_bi_reds_6frames
+  model: vrt
+  gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7]
+  dist: False
+  find_unused_parameters: False
+  use_static_graph: True
+  scale: 4
+  n_channels: 3
+  path:[
+    root: experiments
+    pretrained_netG: /home/cll/dev/KAIR/model_zoo/vrt/001_VRT_videosr_bi_REDS_6frames.pth
+    pretrained_netE: None
+    task: experiments/001_train_vrt_videosr_bi_reds_6frames
+    log: experiments/001_train_vrt_videosr_bi_reds_6frames
+    options: experiments/001_train_vrt_videosr_bi_reds_6frames/options
+    models: experiments/001_train_vrt_videosr_bi_reds_6frames/models
+    images: experiments/001_train_vrt_videosr_bi_reds_6frames/images
+    pretrained_optimizerG: None
+  ]
+  datasets:[
+    train:[
+      name: train_dataset
+      dataset_type: VideoRecurrentTrainDataset
+      dataroot_gt: /home/cll/datasets/REDS/train/train_sharp
+      dataroot_lq: /home/cll/datasets/REDS/train/train_sharp_bicubic/X4
+      meta_info_file: data/meta_info/meta_info_REDS_GT.txt
+      filename_tmpl: 08d
+      filename_ext: png
+      val_partition: REDS4
+      test_mode: False
+      io_backend:[
+        type: disk
+      ]
+      num_frame: 4
+      gt_size: 256
+      interval_list: [1]
+      random_reverse: False
+      use_hflip: True
+      use_rot: True
+      dataloader_shuffle: True
+      dataloader_num_workers: 32
+      dataloader_batch_size: 8
+      phase: train
+      scale: 4
+      n_channels: 3
+    ]
+    test:[
+      name: test_dataset
+      dataset_type: VideoRecurrentTestDataset
+      dataroot_gt: /home/cll/Desktop/REDS4/GT
+      dataroot_lq: /home/cll/Desktop/REDS4/sharp_bicubic
+      cache_data: True
+      io_backend:[
+        type: disk
+      ]
+      num_frame: -1
+      phase: test
+      scale: 4
+      n_channels: 3
+    ]
+  ]
+  netG:[
+    net_type: vrt
+    upscale: 4
+    img_size: [6, 64, 64]
+    window_size: [2, 8, 8]
+    depths: [8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4]
+    indep_reconsts: [11, 12]
+    embed_dims: [120, 120, 120, 120, 120, 120, 120, 180, 180, 180, 180, 180, 180]
+    num_heads: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
+    spynet_path: model_zoo/vrt/spynet_sintel_final-3d2a1287.pth
+    pa_frames: 2
+    deformable_groups: 12
+    nonblind_denoising: False
+    use_checkpoint_attn: False
+    use_checkpoint_ffn: False
+    no_checkpoint_attn_blocks: []
+    no_checkpoint_ffn_blocks: []
+    init_type: default
+    scale: 4
+  ]
+  train:[
+    G_lossfn_type: charbonnier
+    G_lossfn_weight: 1.0
+    G_charbonnier_eps: 1e-09
+    E_decay: 0
+    G_optimizer_type: adam
+    G_optimizer_lr: 0.0004
+    G_optimizer_betas: [0.9, 0.99]
+    G_optimizer_wd: 0
+    G_optimizer_clipgrad: None
+    G_optimizer_reuse: True
+    fix_iter: 20000
+    fix_lr_mul: 0.125
+    fix_keys: ['spynet', 'deform']
+    total_iter: 300000
+    G_scheduler_type: CosineAnnealingWarmRestarts
+    G_scheduler_periods: 300000
+    G_scheduler_eta_min: 1e-07
+    G_regularizer_orthstep: None
+    G_regularizer_clipstep: None
+    G_param_strict: True
+    E_param_strict: True
+    checkpoint_test: 5000
+    checkpoint_save: 5000
+    checkpoint_print: 200
+    F_feature_layer: 34
+    F_weights: 1.0
+    F_lossfn_type: l1
+    F_use_input_norm: True
+    F_use_range_norm: False
+    G_scheduler_restart_weights: 1
+  ]
+  val:[
+    save_img: False
+    pad_seq: False
+    flip_seq: False
+    center_frame_only: False
+    num_frame_testing: 40
+    num_frame_overlapping: 2
+    size_patch_testing: 128
+  ]
+  opt_path: options/vrt/001_train_vrt_videosr_bi_reds_6frames.json
+  is_train: True
+  merge_bn: False
+  merge_bn_startpoint: -1
+  num_gpu: 8
+  rank: 0
+  world_size: 1
+
+22-03-11 10:53:40.969 : Number of train images: 24,000, iters: 3,000
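[Editor's note: the iteration counts in the "Number of train images" lines are simply images divided by dataloader_batch_size: 27,000 / 8 = 3,375 for the first run and 24,000 / 8 = 3,000 for this one. Both runs optimize the Charbonnier loss named in the train block above (G_lossfn_type: charbonnier, G_charbonnier_eps: 1e-09), a smooth variant of L1 that stays differentiable at zero. A minimal sketch under the standard formulation follows; KAIR's own implementation may differ in detail.]

```python
import torch

class CharbonnierLoss(torch.nn.Module):
    """Charbonnier loss: mean(sqrt((x - y)^2 + eps)).

    eps matches G_charbonnier_eps in the config above; it keeps the
    gradient finite where x == y. Illustrative sketch, not necessarily
    KAIR's exact code."""

    def __init__(self, eps: float = 1e-9):
        super().__init__()
        self.eps = eps

    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        diff = x - y
        return torch.mean(torch.sqrt(diff * diff + self.eps))
```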